From 67d4c9e5a49758cd0970486633f4f81dcf68a611 Mon Sep 17 00:00:00 2001
From: guanjihuan
Date: Wed, 23 Oct 2024 03:50:03 +0800
Subject: [PATCH] update

---
 .../collect_prb_abstracts.py         | 88 -------------------
 .../collect_prl_abstracts.py         | 88 -------------------
 .../count_words.py                   | 41 ---------
 2021.11.17_zhihu/nature_physics.py   | 37 --------
 2021.11.17_zhihu/physics_magazine.py | 36 --------
 2021.11.17_zhihu/prb.py              | 42 ---------
 2021.11.17_zhihu/prl.py              | 42 ---------
 2021.11.17_zhihu/zhihu.py            | 66 --------------
 8 files changed, 440 deletions(-)
 delete mode 100644 2021.06.07_find_common_words_in_APS_abstracts/collect_prb_abstracts.py
 delete mode 100644 2021.06.07_find_common_words_in_APS_abstracts/collect_prl_abstracts.py
 delete mode 100644 2021.06.07_find_common_words_in_APS_abstracts/count_words.py
 delete mode 100644 2021.11.17_zhihu/nature_physics.py
 delete mode 100644 2021.11.17_zhihu/physics_magazine.py
 delete mode 100644 2021.11.17_zhihu/prb.py
 delete mode 100644 2021.11.17_zhihu/prl.py
 delete mode 100644 2021.11.17_zhihu/zhihu.py

diff --git a/2021.06.07_find_common_words_in_APS_abstracts/collect_prb_abstracts.py b/2021.06.07_find_common_words_in_APS_abstracts/collect_prb_abstracts.py
deleted file mode 100644
index 633fdb1..0000000
--- a/2021.06.07_find_common_words_in_APS_abstracts/collect_prb_abstracts.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""
-This code is supported by the website: https://www.guanjihuan.com
-The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
-"""
-
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-import re
-from collections import Counter
-import datetime
-import random
-import time
-
-
-# time.sleep(random.uniform(0,1800)) # simple crawler disguise: start after a random delay of 0 to 30 minutes; comment this line out when debugging
-year = datetime.datetime.now().year
-month = datetime.datetime.now().month
-day = datetime.datetime.now().day
-
-
-# Collect the links
-try:
-    with open('prb_link_list.txt', 'r', encoding='UTF-8') as f: # if the file exists
-        link_list = f.read().split('\n') # links already visited in earlier runs (as a list)
-except:
-    with open('prb_link_list.txt', 'w', encoding='UTF-8') as f: # if the file does not exist
-        link_list = []
-f = open('prb_link_list.txt', 'a', encoding='UTF-8') # open the file for appending
-f.write('\nLink list obtained on '+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+':\n')
-match_href = [] # links that match the criteria in this run
-for loop in range(3):
-    if loop == 0:
-        start_link = "https://journals.aps.org/prb/recent?page=1" # page 1
-    elif loop == 1:
-        start_link = "https://journals.aps.org/prb/recent?page=2" # page 2
-    elif loop == 2:
-        start_link = "https://journals.aps.org/prb/recent?page=3" # page 3 (three pages basically cover all of the day's updates)
-    html = urlopen(start_link).read().decode('utf-8') # open the web page
-    soup = BeautifulSoup(html, features='lxml') # parse it with BeautifulSoup
-    all_a_tag = soup.find_all('a', href=True) # get all hyperlink tags
-    for a_tag in all_a_tag:
-        href = a_tag['href'] # the hyperlink string
-        if re.search('/abstract/', href): # article links
-            if re.search('https://journals.aps.org', href)==None: # if the link is not complete, prepend the domain
-                href = 'https://journals.aps.org'+ href
-            if href not in match_href and href not in link_list and re.search('\?', href)==None: # skip duplicate links
-                match_href.append(href)
-                f.write(href+'\n')
-f.close()
-
-
-
-# Collect the abstracts
-try:
-    f = open('prb_all.txt', 'a', encoding='UTF-8') # full record
-except:
-    f = open('prb_all.txt', 'w', encoding='UTF-8') # if the file does not exist
-try:
-    f_month = open('prb_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'a', encoding='UTF-8') # record for the current month
-except:
-    f_month = open('prb_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'w', encoding='UTF-8') # if the file does not exist
-f.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
-f_month.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
-print('total number=', len(match_href)) # printed for debugging
-i00 = 0
-for href in match_href:
-    i00 += 1
-    print('reading number', i00, '...') # printed for debugging
-    # time.sleep(random.uniform(10,110)) # simple crawler disguise: rest about one minute per link; with 60 links the run takes about 60 minutes longer; comment this line out when debugging
-    try:
-        html = urlopen(href).read().decode('utf-8') # open the article link
-        soup = BeautifulSoup(html, features='lxml') # parse it with BeautifulSoup
-        title = soup.title # article title
-        f.write(str(title.get_text())+'\n\n')
-        f_month.write(str(title.get_text())+'\n\n')
-        f.write(str(href)+'\n\n') # article link
-        f_month.write(str(href)+'\n\n')
-        abstract = re.findall('"yes"><p>.*</p><div', html, re.S)[0] # article abstract
-        word_list = abstract.split(' ') # split it into words
-        for word in word_list:
-            if re.search('<', word)==None and re.search('>', word)==None: # some words match this filter, so a little information may be lost
-                f.write(word+' ')
-                f_month.write(word+' ')
-        f.write('\n\n\n')
-        f_month.write('\n\n\n')
-    except:
-        pass
-f.close()
\ No newline at end of file
diff --git a/2021.06.07_find_common_words_in_APS_abstracts/collect_prl_abstracts.py b/2021.06.07_find_common_words_in_APS_abstracts/collect_prl_abstracts.py
deleted file mode 100644
index f7ef4e0..0000000
--- a/2021.06.07_find_common_words_in_APS_abstracts/collect_prl_abstracts.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""
-This code is supported by the website: https://www.guanjihuan.com
-The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
-"""
-
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-import re
-from collections import Counter
-import datetime
-import random
-import time
-
-
-# time.sleep(random.uniform(0,1800)) # simple crawler disguise: start after a random delay of 0 to 30 minutes; comment this line out when debugging
-year = datetime.datetime.now().year
-month = datetime.datetime.now().month
-day = datetime.datetime.now().day
-
-
-# Collect the links
-try:
-    with open('prl_link_list.txt', 'r', encoding='UTF-8') as f: # if the file exists
-        link_list = f.read().split('\n') # links already visited in earlier runs (as a list)
-except:
-    with open('prl_link_list.txt', 'w', encoding='UTF-8') as f: # if the file does not exist
-        link_list = []
-f = open('prl_link_list.txt', 'a', encoding='UTF-8') # open the file for appending
-f.write('\nLink list obtained on '+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+':\n')
-match_href = [] # links that match the criteria in this run
-for loop in range(3):
-    if loop == 0:
-        start_link = "https://journals.aps.org/prl/recent?page=1" # page 1
-    elif loop == 1:
-        start_link = "https://journals.aps.org/prl/recent?page=2" # page 2
-    elif loop == 2:
-        start_link = "https://journals.aps.org/prl/recent?page=3" # page 3 (three pages basically cover all of the day's updates)
-    html = urlopen(start_link).read().decode('utf-8') # open the web page
-    soup = BeautifulSoup(html, features='lxml') # parse it with BeautifulSoup
-    all_a_tag = soup.find_all('a', href=True) # get all hyperlink tags
-    for a_tag in all_a_tag:
-        href = a_tag['href'] # the hyperlink string
-        if re.search('/abstract/', href): # article links
-            if re.search('https://journals.aps.org', href)==None: # if the link is not complete, prepend the domain
-                href = 'https://journals.aps.org'+ href
-            if href not in match_href and href not in link_list and re.search('\?', href)==None: # skip duplicate links
-                match_href.append(href)
-                f.write(href+'\n')
-f.close()
-
-
-
-# Collect the abstracts
-try:
-    f = open('prl_all.txt', 'a', encoding='UTF-8') # full record
-except:
-    f = open('prl_all.txt', 'w', encoding='UTF-8') # if the file does not exist
-try:
-    f_month = open('prl_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'a', encoding='UTF-8') # record for the current month
-except:
-    f_month = open('prl_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'w', encoding='UTF-8') # if the file does not exist
-f.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
-f_month.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
-print('total number=', len(match_href)) # printed for debugging
-i00 = 0
-for href in match_href:
-    i00 += 1
-    print('reading number', i00, '...') # printed for debugging
-    # time.sleep(random.uniform(10,110)) # simple crawler disguise: rest about one minute per link; with 60 links the run takes about 60 minutes longer; comment this line out when debugging
-    try:
-        html = urlopen(href).read().decode('utf-8') # open the article link
-        soup = BeautifulSoup(html, features='lxml') # parse it with BeautifulSoup
-        title = soup.title # article title
-        f.write(str(title.get_text())+'\n\n')
-        f_month.write(str(title.get_text())+'\n\n')
-        f.write(str(href)+'\n\n') # article link
-        f_month.write(str(href)+'\n\n')
-        abstract = re.findall('"yes"><p>.*</p><div', html, re.S)[0] # article abstract
-        word_list = abstract.split(' ') # split it into words
-        for word in word_list:
-            if re.search('<', word)==None and re.search('>', word)==None: # some words match this filter, so a little information may be lost
-                f.write(word+' ')
-                f_month.write(word+' ')
-        f.write('\n\n\n')
-        f_month.write('\n\n\n')
-    except:
-        pass
-f.close()
\ No newline at end of file
diff --git a/2021.06.07_find_common_words_in_APS_abstracts/count_words.py b/2021.06.07_find_common_words_in_APS_abstracts/count_words.py
deleted file mode 100644
index b3d179b..0000000
--- a/2021.06.07_find_common_words_in_APS_abstracts/count_words.py
+++ /dev/null
@@ -1,41 +0,0 @@
-"""
-This code is supported by the website: https://www.guanjihuan.com
-The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
-"""
-
-import re
-from collections import Counter
-
-
-def main():
-    file_name = 'prb_all.txt'
-    with open(file_name, 'r', encoding='UTF-8') as f: # open the file
-        paper_list = f.read().split('\n\n\n') # papers are separated by three newlines
-    word_list = []
-    ignore = ignore_words() # filter out common words
-    for paper in paper_list:
-        word_list_in_one_paper = []
-        if len(paper)>20: # filter out the date lines by string length
-            content_list = paper.split('\n\n') # content blocks are separated by two newlines
-            for content in content_list:
-                if re.search('https://', content)==None: # skip the article links
-                    words = content.split(' ') # split the content into words by spaces
-                    for word in words:
-                        if word not in word_list_in_one_paper: # count each word at most once per paper
-                            if word not in ignore and len(word)>1: # filter out ignored words
-                                word_list.append(word)
-                                word_list_in_one_paper.append(word)
-    num = 300
-    most_common_words = Counter(word_list).most_common(num) # the num most frequent words
-    print('\n出现频率最高的前', num, '个词汇:')
-    for word in most_common_words:
-        print(word)
-
-
-def ignore_words(): # add or remove words as needed
-    ignore = ['Phys.', 'the', 'to', 'of', 'in', 'under', 'and', 'by', 'The', 'at', 'with', 'up', 'be', 'above', 'below', 'are', 'is', 'for', 'that', 'as', 'we', ...]
-    return ignore
-
-
-if __name__ == '__main__':
-    main()
diff --git a/2021.11.17_zhihu/nature_physics.py b/2021.11.17_zhihu/nature_physics.py
deleted file mode 100644
--- a/2021.11.17_zhihu/nature_physics.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-import re
-import datetime
-
-
-year = datetime.datetime.now().year
-month = datetime.datetime.now().month
-day = datetime.datetime.now().day
-
-f = open('nature_physics.html', 'w', encoding='UTF-8')
-f.write('')
-f.write('    '+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+' 已更新    ')
-
-match_href = []
-start_link = "https://www.nature.com/nphys/research-articles"
-html = urlopen(start_link).read().decode('utf-8') # open the web page
-soup = BeautifulSoup(html, features='lxml') # parse it with BeautifulSoup
-all_article = soup.find_all('article', {"class":"u-full-height c-card c-card--flush"})
-for article in all_article:
-    all_a_tag = article.find_all('a', href=True) # get all hyperlink tags
-    for a_tag in all_a_tag:
-        href = a_tag['href'] # the hyperlink string
-        if re.search('/articles/', href): # article links
-            if re.search('https://www.nature.com', href)==None: # if the link is not complete, prepend the domain
-                href = 'https://www.nature.com'+ href
-            if href not in match_href and re.search('\?', href)==None: # skip duplicate links
-                match_href.append(href)
-                f.write('  • ')
-                f.write(a_tag.get_text())
-                f.write('  ')
-                time = article.find('time', {"class": "c-meta__item c-meta__item--block-at-lg"}).get_text()
-                f.write(time+'  • ')
-f.close()
\ No newline at end of file
diff --git a/2021.11.17_zhihu/physics_magazine.py b/2021.11.17_zhihu/physics_magazine.py
deleted file mode 100644
index e0efc6c..0000000
--- a/2021.11.17_zhihu/physics_magazine.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-import re
-import datetime
-
-
-year = datetime.datetime.now().year
-month = datetime.datetime.now().month
-day = datetime.datetime.now().day
-
-f = open('physics_magazine.html', 'w', encoding='UTF-8')
-f.write('')
-f.write('    '+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+' 已更新    ')
-
-match_href = []
-start_link = "https://physics.aps.org/"
-html = urlopen(start_link).read().decode('utf-8') # open the web page
-soup = BeautifulSoup(html, features='lxml') # parse it with BeautifulSoup
-all_articles = soup.find_all('div', {"class":"feed-item-details"})
-for article in all_articles:
-    all_a_tag = article.find_all('a', href=True) # get all hyperlink tags
-    for a_tag in all_a_tag:
-        href = a_tag['href'] # the hyperlink string
-        if re.search('/articles/', href): # article links
-            if re.search('https://physics.aps.org', href)==None: # if the link is not complete, prepend the domain
-                href = 'https://physics.aps.org'+ href
-            if href not in match_href:
-                match_href.append(href)
-                f.write('  • ')
-                f.write(a_tag.get_text())
-                f.write('  ')
-                time = article.find('time', {"class": "feed-item-date"}).get_text()
-                f.write(time+'  • ')
-f.close()
\ No newline at end of file
diff --git a/2021.11.17_zhihu/prb.py b/2021.11.17_zhihu/prb.py
deleted file mode 100644
index 04c647d..0000000
--- a/2021.11.17_zhihu/prb.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-import re
-import datetime
-
-
-year = datetime.datetime.now().year
-month = datetime.datetime.now().month
-day = datetime.datetime.now().day
-
-
-f = open('prb.html', 'w', encoding='UTF-8')
-f.write('')
-f.write('    '+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+' 已更新    ')
-
-match_href = []
-for loop in range(1):
-    if loop == 0:
-        start_link = "https://journals.aps.org/prb/recent" # page 1
-    # elif loop == 1:
-    #     start_link = "https://journals.aps.org/prb/recent?page=2" # page 2
-    html = urlopen(start_link).read().decode('utf-8') # open the web page
-    soup = BeautifulSoup(html, features='lxml') # parse it with BeautifulSoup
-    all_article = soup.find_all('div', {"class":"article panel article-result"})
-    for article in all_article:
-        all_a_tag = article.find_all('a', href=True) # get all hyperlink tags
-        for a_tag in all_a_tag:
-            href = a_tag['href'] # the hyperlink string
-            if re.search('/abstract/', href): # article links
-                if re.search('https://journals.aps.org', href)==None: # if the link is not complete, prepend the domain
-                    href = 'https://journals.aps.org'+ href
-                if href not in match_href and re.search('\?', href)==None: # skip duplicate links
-                    match_href.append(href)
-                    f.write('  • ')
-                    f.write(a_tag.get_text())
-                    f.write('  ')
-                    info = article.find('h6', {"class": "pub-info"}).get_text()
-                    f.write(re.findall('– Published .*', info, re.S)[0][12:]+'  • ')
-f.close()
-
diff --git a/2021.11.17_zhihu/prl.py b/2021.11.17_zhihu/prl.py
deleted file mode 100644
index fb1f39a..0000000
--- a/2021.11.17_zhihu/prl.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-import re
-import datetime
-
-
-year = datetime.datetime.now().year
-month = datetime.datetime.now().month
-day = datetime.datetime.now().day
-
-
-f = open('prl.html', 'w', encoding='UTF-8')
-f.write('')
-f.write('    '+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+' 已更新    ')
-
-match_href = []
-for loop in range(1):
-    if loop == 0:
-        start_link = "https://journals.aps.org/prl/recent" # page 1
-    # elif loop == 1:
-    #     start_link = "https://journals.aps.org/prl/recent?page=2" # page 2
-    html = urlopen(start_link).read().decode('utf-8') # open the web page
-    soup = BeautifulSoup(html, features='lxml') # parse it with BeautifulSoup
-    all_article = soup.find_all('div', {"class":"article panel article-result"})
-    for article in all_article:
-        all_a_tag = article.find_all('a', href=True) # get all hyperlink tags
-        for a_tag in all_a_tag:
-            href = a_tag['href'] # the hyperlink string
-            if re.search('/abstract/', href): # article links
-                if re.search('https://journals.aps.org', href)==None: # if the link is not complete, prepend the domain
-                    href = 'https://journals.aps.org'+ href
-                if href not in match_href and re.search('\?', href)==None: # skip duplicate links
-                    match_href.append(href)
-                    f.write('  • ')
-                    f.write(a_tag.get_text())
-                    f.write('  ')
-                    info = article.find('h6', {"class": "pub-info"}).get_text()
-                    f.write(re.findall('– Published.*', info, re.S)[0][12:]+'  • ')
-f.close()
-
diff --git a/2021.11.17_zhihu/zhihu.py b/2021.11.17_zhihu/zhihu.py
deleted file mode 100644
index a81dc03..0000000
--- a/2021.11.17_zhihu/zhihu.py
+++ /dev/null
@@ -1,66 +0,0 @@
-"""
-This code is supported by the website: https://www.guanjihuan.com
-The newest version of this code is on the web page: https://www.guanjihuan.com/archives/17937
-"""
-
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-import re
-import datetime
-
-year = datetime.datetime.now().year
-month = datetime.datetime.now().month
-day = datetime.datetime.now().day
-
-# Collect the links
-
-# Without a simulated Zhihu login, only the two most recent articles can be crawled
-authors = ["https://www.zhihu.com/people/guanjihuan/posts"] # Guan
-
-match_href = []
-for i0 in range(len(authors)):
-    start_link = authors[i0]
-    html = urlopen(start_link).read().decode('utf-8') # open the web page
-    soup = BeautifulSoup(html, features='lxml') # parse it with BeautifulSoup
-    all_a_tag = soup.find_all('a', href=True) # get all hyperlink tags
-    for a_tag in all_a_tag:
-        href = a_tag['href'] # the hyperlink string
-        if re.search('//zhuanlan.zhihu.com/p/', href) and not re.search('edit', href): # article links
-            if re.search('https:', href)==None: # if the link is not complete, prepend the scheme
-                href = 'https:'+ href
-            if href not in match_href:
-                match_href.append(href)
-
-
-# Sort the links
-numbers = []
-match_href_new = []
-for href in match_href:
-    numbers.append(int(href[29:]))
-numbers.sort(reverse = True)
-for n in numbers:
-    match_href_new.append('https://zhuanlan.zhihu.com/p/'+str(n))
-
-
-# Fetch the content and write it to a file
-f = open('zhihu.html', 'w', encoding='UTF-8')
-f.write('')
-
-f.write('    '+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+' 已更新    ')
-for href in match_href_new:
-    try:
-        html = urlopen(href).read().decode('utf-8') # open the article link
-        soup = BeautifulSoup(html, features='lxml') # parse it with BeautifulSoup
-        title = soup.title # article title
-        f.write('  • ')
-        f.write(str(title.get_text()[:-5]))
-        f.write('  ')
-        author = soup.find("span", {"class": "UserLink AuthorInfo-name"})
-        f.write(str(author.get_text()+'  '))
-        post_time = soup.find("div", {"class" : "ContentItem-time"})
-        f.write(str(post_time.get_text()[4:-6])+'  • ')
-    except:
-        pass
-f.close()
\ No newline at end of file
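
Note: every script removed by this patch follows the same scraping pattern: fetch a listing page with urlopen, parse it with BeautifulSoup, keep the anchor hrefs that look like article links, and prepend the site domain when a link is relative. The sketch below restates that pattern in a minimal, self-contained form, assuming the bs4 and lxml packages are installed; the listing URL and the output filename are illustrative placeholders rather than values taken from the deleted files.

    # Minimal sketch of the link-collection pattern shared by the deleted scripts.
    # Assumption: the listing URL and 'links.txt' below are placeholders.
    import re
    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    start_link = "https://journals.aps.org/prb/recent"      # listing page (placeholder)
    html = urlopen(start_link).read().decode('utf-8')       # download the page
    soup = BeautifulSoup(html, features='lxml')             # parse the HTML

    match_href = []
    for a_tag in soup.find_all('a', href=True):             # every hyperlink on the page
        href = a_tag['href']
        if re.search('/abstract/', href):                   # keep only article links
            if re.search('https://journals.aps.org', href) is None:
                href = 'https://journals.aps.org' + href    # complete relative links
            if href not in match_href and re.search(r'\?', href) is None:
                match_href.append(href)                     # skip duplicates and query links

    with open('links.txt', 'w', encoding='UTF-8') as f:     # output file (placeholder)
        f.write('\n'.join(match_href))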