update: write article links as <li> list items instead of <p> paragraphs; fix the Zhihu author URL
@@ -27,11 +27,11 @@ for article in all_article:
                 href = 'https://www.nature.com'+ href
             if href not in match_href and re.search('\?', href)==None:  # skip duplicate links
                 match_href.append(href)
-                f.write('<p><a target=\"_blank\" href=\"')
+                f.write('<li><a target=\"_blank\" href=\"')
                 f.write(href)   # article link
                 f.write('\">')
                 f.write(a_tag.get_text())
                 f.write('</a>  ')
     time = article.find('time', {"class": "c-meta__item c-meta__item--block-at-lg"}).get_text()
-    f.write(time+'</p>')
+    f.write(time+'</li>')
 f.close()
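
This hunk and the three that follow all make the same change: each scraped link is now written as an <li> list item instead of a <p> paragraph, so the generated page can render the links as a proper HTML list. A minimal, self-contained sketch of the shared writing pattern, with a hypothetical output file name and hypothetical item data (the real scripts pull href, title, and date out of BeautifulSoup tags):

# Sketch of the per-item writing pattern; 'index.html' and the
# items list are hypothetical placeholders, not from the scripts.
items = [
    ('https://www.nature.com/articles/example', 'Example article', '01 March 2021'),
]
with open('index.html', 'w', encoding='utf-8') as f:
    f.write('<ul>\n')   # <li> items need an enclosing <ul> (or <ol>)
    for href, title, date in items:
        f.write('<li><a target="_blank" href="')
        f.write(href)   # article link
        f.write('">')
        f.write(title)
        f.write('</a>  ')
        f.write(date + '</li>\n')
    f.write('</ul>\n')

The enclosing <ul> is presumably written elsewhere in each script; these hunks only touch the per-item markup.
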
@@ -26,11 +26,11 @@ for article in all_articles:
                 href = 'https://physics.aps.org'+ href
             if href not in match_href:
                 match_href.append(href)
-                f.write('<p><a target=\"_blank\" href=\"')
+                f.write('<li><a target=\"_blank\" href=\"')
                 f.write(href)   # article link
                 f.write('\">')
                 f.write(a_tag.get_text())
                 f.write('</a>  ')
     time = article.find('time', {"class": "feed-item-date"}).get_text()
-    f.write(time+'</p>')
+    f.write(time+'</li>')
 f.close()
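
The Nature and journals.aps.org hunks share one more detail: match_href deduplicates links, and re.search('\?', href)==None drops any URL that carries a query string. The idiom in isolation (the candidate paths are hypothetical):

import re

candidates = ['/articles/a1',
              '/articles/a1',                            # duplicate: skipped
              '/articles/a2?searchType=journalSearch']   # query string: skipped
match_href = []
for href in candidates:
    href = 'https://www.nature.com' + href
    # keep only new links without a query string; the raw string r'\?'
    # avoids the invalid-escape warning the scripts' plain '\?' can raise
    if href not in match_href and re.search(r'\?', href) is None:
        match_href.append(href)
print(match_href)   # ['https://www.nature.com/articles/a1']
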
@@ -31,12 +31,12 @@ for loop in range(1):
                     href = 'https://journals.aps.org'+ href
                 if href not in match_href and re.search('\?', href)==None:  # skip duplicate links
                     match_href.append(href)
-                    f.write('<p><a target=\"_blank\" href=\"')
+                    f.write('<li><a target=\"_blank\" href=\"')
                     f.write(href)   # article link
                     f.write('\">')
                     f.write(a_tag.get_text())
                     f.write('</a>  ')
         info = article.find('h6', {"class": "pub-info"}).get_text()
-        f.write(re.findall('– Published .*', info, re.S)[0][12:]+'</p>')
+        f.write(re.findall('– Published .*', info, re.S)[0][12:]+'</li>')
 f.close()
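
The journals.aps.org hunks also lean on a slicing detail: the pub-info text ends in something like '… – Published 1 March, 2021', re.findall('– Published .*', info, re.S)[0] grabs the tail starting at the en dash, and since '– Published ' is 12 characters, [12:] leaves just the date. A quick check with a hypothetical sample string:

import re

info = 'Phys. Rev. Lett. 126, 090501 (2021) – Published 1 March, 2021'  # hypothetical
tail = re.findall('– Published .*', info, re.S)[0]
print(tail)        # – Published 1 March, 2021
print(tail[12:])   # 1 March, 2021
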
@@ -31,12 +31,12 @@ for loop in range(1):
                     href = 'https://journals.aps.org'+ href
                 if href not in match_href and re.search('\?', href)==None:  # skip duplicate links
                     match_href.append(href)
-                    f.write('<p><a target=\"_blank\" href=\"')
+                    f.write('<li><a target=\"_blank\" href=\"')
                     f.write(href)   # article link
                     f.write('\">')
                     f.write(a_tag.get_text())
                     f.write('</a>  ')
         info = article.find('h6', {"class": "pub-info"}).get_text()
-        f.write(re.findall('– Published.*', info, re.S)[0][12:]+'</p>')
+        f.write(re.findall('– Published.*', info, re.S)[0][12:]+'</li>')
 f.close()

@@ -15,7 +15,7 @@ day = datetime.datetime.now().day
 # collect the links
 match_href = []
 # without a simulated Zhihu login, only the two most recent articles can be crawled
-authors = ["https://www.zhihu.com/people/g3508/posts", # Guan
+authors = ["https://www.zhihu.com/people/guanjihuan/posts", # Guan
 ]
 for i0 in range(len(authors)):
     start_link = authors[i0]
@@ -47,7 +47,7 @@ for href in match_href_new:
         html = urlopen(href).read().decode('utf-8')   # open the article link
         soup = BeautifulSoup(html, features='lxml') # parse into soup
         title = soup.title   # article title
-        f.write('<p><a target=\"_blank\" href=\"')
+        f.write('<li><a target=\"_blank\" href=\"')
         f.write(str(href))   # article link
         f.write('\">')
         f.write(str(title.get_text()[:-5]))
@@ -55,7 +55,7 @@ for href in match_href_new:
         author = soup.find("span", {"class": "UserLink AuthorInfo-name"})
         f.write(str(author.get_text()+'  '))
         post_time = soup.find("div", {"class" : "ContentItem-time"})
-        f.write(str(post_time.get_text()[4:-6])+'</p>')
+        f.write(str(post_time.get_text()[4:-6])+'</li>')
     except:
         pass
 f.close()
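
These Zhihu hunks sit inside a fetch-and-parse loop: each article page is opened with urlopen, parsed with BeautifulSoup, and the [:-5] on the title presumably strips Zhihu's five-character ' - 知乎' suffix, while the bare except skips pages that fail to load or parse. A minimal sketch of that flow under the same libraries (the author URL is the one from the diff; the output file name is hypothetical):

from urllib.request import urlopen
from bs4 import BeautifulSoup

href = 'https://www.zhihu.com/people/guanjihuan/posts'  # URL from the diff
with open('zhihu.html', 'w', encoding='utf-8') as f:    # hypothetical file name
    f.write('<ul>\n')
    try:
        html = urlopen(href).read().decode('utf-8')   # open the page
        soup = BeautifulSoup(html, features='lxml')   # parse into soup
        title = soup.title.get_text()[:-5]            # drop the trailing ' - 知乎'
        f.write('<li><a target="_blank" href="' + href + '">')
        f.write(title + '</a></li>\n')
    except Exception:   # the script uses a bare except for the same purpose
        pass            # skip pages that fail to load or parse
    f.write('</ul>\n')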