diff --git a/language_learning/2021.11.17_zhihu/nature_physics.py b/language_learning/2021.11.17_zhihu/nature_physics.py index fe158a7..dfdb260 100644 --- a/language_learning/2021.11.17_zhihu/nature_physics.py +++ b/language_learning/2021.11.17_zhihu/nature_physics.py @@ -27,11 +27,11 @@ for article in all_article: href = 'https://www.nature.com'+ href if href not in match_href and re.search('\?', href)==None: # 链接不重复 match_href.append(href) - f.write('
') f.write(a_tag.get_text()) f.write(' ') time = article.find('time', {"class": "c-meta__item c-meta__item--block-at-lg"}).get_text() - f.write(time+'
') + f.write(time+'') f.close() \ No newline at end of file diff --git a/language_learning/2021.11.17_zhihu/physics_magazine.py b/language_learning/2021.11.17_zhihu/physics_magazine.py index f7612d3..e0efc6c 100644 --- a/language_learning/2021.11.17_zhihu/physics_magazine.py +++ b/language_learning/2021.11.17_zhihu/physics_magazine.py @@ -26,11 +26,11 @@ for article in all_articles: href = 'https://physics.aps.org'+ href if href not in match_href: match_href.append(href) - f.write('') f.write(a_tag.get_text()) f.write(' ') time = article.find('time', {"class": "feed-item-date"}).get_text() - f.write(time+'
') + f.write(time+'') f.close() \ No newline at end of file diff --git a/language_learning/2021.11.17_zhihu/prb.py b/language_learning/2021.11.17_zhihu/prb.py index 3a3c624..04c647d 100644 --- a/language_learning/2021.11.17_zhihu/prb.py +++ b/language_learning/2021.11.17_zhihu/prb.py @@ -31,12 +31,12 @@ for loop in range(1): href = 'https://journals.aps.org'+ href if href not in match_href and re.search('\?', href)==None: # 链接不重复 match_href.append(href) - f.write('') f.write(a_tag.get_text()) f.write(' ') info = article.find('h6', {"class": "pub-info"}).get_text() - f.write(re.findall('– Published .*', info, re.S)[0][12:]+'
') + f.write(re.findall('– Published .*', info, re.S)[0][12:]+'') f.close() diff --git a/language_learning/2021.11.17_zhihu/prl.py b/language_learning/2021.11.17_zhihu/prl.py index 2deacce..fb1f39a 100644 --- a/language_learning/2021.11.17_zhihu/prl.py +++ b/language_learning/2021.11.17_zhihu/prl.py @@ -31,12 +31,12 @@ for loop in range(1): href = 'https://journals.aps.org'+ href if href not in match_href and re.search('\?', href)==None: # 链接不重复 match_href.append(href) - f.write('') f.write(a_tag.get_text()) f.write(' ') info = article.find('h6', {"class": "pub-info"}).get_text() - f.write(re.findall('– Published.*', info, re.S)[0][12:]+'
') + f.write(re.findall('– Published.*', info, re.S)[0][12:]+'') f.close() diff --git a/language_learning/2021.11.17_zhihu/zhihu.py b/language_learning/2021.11.17_zhihu/zhihu.py index 14a9345..e2b1b9b 100644 --- a/language_learning/2021.11.17_zhihu/zhihu.py +++ b/language_learning/2021.11.17_zhihu/zhihu.py @@ -15,7 +15,7 @@ day = datetime.datetime.now().day # 获取链接 match_href = [] # 由于没有模拟登录知乎,因此只能爬取到最新的两篇文章 -authors = ["https://www.zhihu.com/people/g3508/posts", # Guan +authors = ["https://www.zhihu.com/people/guanjihuan/posts", # Guan ] for i0 in range(len(authors)): start_link = authors[i0] @@ -47,7 +47,7 @@ for href in match_href_new: html = urlopen(href).read().decode('utf-8') # 打开文章链接 soup = BeautifulSoup(html, features='lxml') # 放入soup中 title = soup.title # 文章标题 - f.write('') f.write(str(title.get_text()[:-5])) @@ -55,7 +55,7 @@ for href in match_href_new: author = soup.find("span", {"class": "UserLink AuthorInfo-name"}) f.write(str(author.get_text()+' ')) post_time = soup.find("div", {"class" : "ContentItem-time"}) - f.write(str(post_time.get_text()[4:-6])+'
') + f.write(str(post_time.get_text()[4:-6])+'') except: pass f.close() \ No newline at end of file