From da6217459cb03068d59f3c1ffb6ea872de8c2b57 Mon Sep 17 00:00:00 2001
From: guanjihuan <34735497+guanjihuan@users.noreply.github.com>
Date: Fri, 11 Jun 2021 19:23:32 +0800
Subject: [PATCH] update

---
 .../find_common_words_on_APS.py               | 89 -------------------
 ...service_watching_recent_at_regular_time.py | 59 ------------
 .../collect_prb_abstracts.py                  | 88 ++++++++++++++++++
 .../collect_prl_abstracts.py                  | 88 ++++++++++++++++++
 .../count_words.py                            | 41 +++++++++
 5 files changed, 217 insertions(+), 148 deletions(-)
 delete mode 100755 language_learning/2021.06.07_find_common_words_on_APS/find_common_words_on_APS.py
 delete mode 100755 language_learning/2021.06.07_find_common_words_on_APS/service_watching_recent_at_regular_time.py
 create mode 100644 language_learning/2021.06.07_find_common_words_on_APS_abstracts/collect_prb_abstracts.py
 create mode 100644 language_learning/2021.06.07_find_common_words_on_APS_abstracts/collect_prl_abstracts.py
 create mode 100644 language_learning/2021.06.07_find_common_words_on_APS_abstracts/count_words.py

diff --git a/language_learning/2021.06.07_find_common_words_on_APS/find_common_words_on_APS.py b/language_learning/2021.06.07_find_common_words_on_APS/find_common_words_on_APS.py
deleted file mode 100755
index be4eb6c..0000000
--- a/language_learning/2021.06.07_find_common_words_on_APS/find_common_words_on_APS.py
+++ /dev/null
@@ -1,89 +0,0 @@
-"""
-This code is supported by the website: https://www.guanjihuan.com
-The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
-"""
-
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-import re
-from collections import Counter
-import datetime
-
-
-"""
-Supports the homepage and the "recent" page of the APS journals.
-For example, for PRB: https://journals.aps.org/prb and https://journals.aps.org/prb/recent
-Do not run the crawler heavily or in a loop, to keep the IP address from being blacklisted by the official site.
-"""
-
-
-# Link to visit
-visit_link = "https://journals.aps.org/prb"
-# visit_link = "https://journals.aps.org/prb/recent"
-# visit_link = "https://journals.aps.org/prl"
-# visit_link = "https://journals.aps.org/prl/recent"
-
-
-# Collect the article links
-html = urlopen(visit_link).read().decode('utf-8')
-soup = BeautifulSoup(html, features='lxml')
-all_a_tag = soup.find_all('a', href=True)
-match_href = []
-for a_tag in all_a_tag:
-    href = a_tag['href']
-    if re.search('https://journals.aps.org/.*/abstract', href) or re.search('.*/abstract/', href):
-        if href not in match_href and re.search('\?', href)==None:
-            if re.search('https://journals.aps.org', href)==None:
-                href = 'https://journals.aps.org'+ href
-            match_href.append(href)
-print('\nThe current page contains', len(match_href), 'articles in total.\n')
-
-
-# Collect the abstract content of each article
-i0 = 0
-year = datetime.datetime.now().year
-month = datetime.datetime.now().month
-day = datetime.datetime.now().day
-f = open(str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'_word_list.txt', 'w', encoding='UTF-8')
-ignore_inner = ['alt="Figure', 'the', '.*
-Received', html, re.S)[0]
-    word_list = abstract.split(' ')
-    word_list_for_one_href = []
-    for word in word_list:
-        if 1<', word)==None:
-            if word not in word_list_for_one_href: # Count each word only once per article
-                word_list_for_one_href.append(word)
-                f.write(str(word)+' ')
-f.close()
-
-
-"""
-After running once, the code above can be commented out, so that the website does not need to be visited repeatedly.
-The code below uses the txt file generated by the code above; the ignored words can be chosen as you like, and it can be run and tuned multiple times.
-"""
-
-
-# Words to ignore, chosen as you like (add or remove as needed)
-ignore = []
-year = datetime.datetime.now().year
-month = datetime.datetime.now().month
-day = datetime.datetime.now().day
-with open(str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'_word_list.txt', 'r', encoding='UTF-8') as f:
-    word_list = f.read().split(' ')
-effective_words = []
-for word in word_list:
-    if 1.*
-Received', html, re.S)[0]
-        word_list = abstract.split(' ')
-        word_list_for_one_href = []
-        for word in word_list:
-            if 1', word)==None and re.search('href', word)==None:
-                if word not in word_list_for_one_href:
-                    word_list_for_one_href.append(word)
-                    all_word_list.append(word)
-    most_common_words = Counter(all_word_list).most_common(num)
-    f.write(str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'|number_of_papers='+str(len(match_href)))
-    for word in most_common_words:
-        f.write('|'+str(word))
-    f.write('\n\n')
-    f.write(content_before)
-    f.close()
\ No newline at end of file
diff --git a/language_learning/2021.06.07_find_common_words_on_APS_abstracts/collect_prb_abstracts.py b/language_learning/2021.06.07_find_common_words_on_APS_abstracts/collect_prb_abstracts.py
new file mode 100644
index 0000000..633fdb1
--- /dev/null
+++ b/language_learning/2021.06.07_find_common_words_on_APS_abstracts/collect_prb_abstracts.py
@@ -0,0 +1,88 @@
+"""
+This code is supported by the website: https://www.guanjihuan.com
+The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
+"""
+
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+import re
+from collections import Counter
+import datetime
+import random
+import time
+
+
+# time.sleep(random.uniform(0,1800)) # Simple disguise for the crawler: start running 0 to 30 minutes after the scheduled time. Comment this statement out while debugging.
+year = datetime.datetime.now().year
+month = datetime.datetime.now().month
+day = datetime.datetime.now().day
+
+
+# Collect the links
+try:
+    with open('prb_link_list.txt', 'r', encoding='UTF-8') as f: # If the file exists
+        link_list = f.read().split('\n') # Links already visited in previous runs (as a list)
+except:
+    with open('prb_link_list.txt', 'w', encoding='UTF-8') as f: # If the file does not exist
+        link_list = []
+f = open('prb_link_list.txt', 'a', encoding='UTF-8') # Open the file for appending
+f.write('\nLink list obtained on '+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+':\n')
+match_href = [] # Links that satisfy the conditions in this run
+for loop in range(3):
+    if loop == 0:
+        start_link = "https://journals.aps.org/prb/recent?page=1" # Page 1
+    elif loop == 1:
+        start_link = "https://journals.aps.org/prb/recent?page=2" # Page 2
+    elif loop == 2:
+        start_link = "https://journals.aps.org/prb/recent?page=3" # Page 3 (three pages basically cover all updates of the day)
+    html = urlopen(start_link).read().decode('utf-8') # Open the web page
+    soup = BeautifulSoup(html, features='lxml') # Put the page into soup
+    all_a_tag = soup.find_all('a', href=True) # Get the hyperlink tags
+    for a_tag in all_a_tag:
+        href = a_tag['href'] # Hyperlink string
+        if re.search('/abstract/', href): # An article link
+            if re.search('https://journals.aps.org', href)==None: # If the link is not complete, complete it
+                href = 'https://journals.aps.org'+ href
+            if href not in match_href and href not in link_list and re.search('\?', href)==None: # Avoid duplicate links
+                match_href.append(href)
+                f.write(href+'\n')
+f.close()
+
+
+
+# Collect the abstracts
+try:
+    f = open('prb_all.txt', 'a', encoding='UTF-8') # The full record
+except:
+    f = open('prb_all.txt', 'w', encoding='UTF-8') # If the file does not exist
+try:
+    f_month = open('prb_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'a', encoding='UTF-8') # The record for one month
+except:
+    f_month = open('prb_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'w', encoding='UTF-8') # If the file does not exist
+f.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
+f_month.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
+print('total number=', len(match_href)) # Displayed for debugging
+i00 = 0
+for href in match_href:
+    i00 += 1
+    print('reading number', i00, '...') # Displayed for debugging
+    # time.sleep(random.uniform(10,110)) # Simple disguise for the crawler: rest for about one minute. If there are 60 links, this extends the running time by about 60 minutes. Comment this statement out while debugging.
+    try:
+        html = urlopen(href).read().decode('utf-8') # Open the article link
+        soup = BeautifulSoup(html, features='lxml') # Put the page into soup
+        title = soup.title # Article title
+        f.write(str(title.get_text())+'\n\n')
+        f_month.write(str(title.get_text())+'\n\n')
+        f.write(str(href)+'\n\n') # Article link
+        f_month.write(str(href)+'\n\n')
+        abstract = re.findall('"yes">.*
+            ', word)==None: # Some content matches the filter conditions, so information may be lost.
+                f.write(word+' ')
+                f_month.write(word+' ')
+        f.write('\n\n\n')
+        f_month.write('\n\n\n')
+    except:
+        pass
+f.close()
\ No newline at end of file
diff --git a/language_learning/2021.06.07_find_common_words_on_APS_abstracts/collect_prl_abstracts.py b/language_learning/2021.06.07_find_common_words_on_APS_abstracts/collect_prl_abstracts.py
new file mode 100644
index 0000000..f7ef4e0
--- /dev/null
+++ b/language_learning/2021.06.07_find_common_words_on_APS_abstracts/collect_prl_abstracts.py
@@ -0,0 +1,88 @@
+"""
+This code is supported by the website: https://www.guanjihuan.com
+The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
+"""
+
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+import re
+from collections import Counter
+import datetime
+import random
+import time
+
+
+# time.sleep(random.uniform(0,1800)) # Simple disguise for the crawler: start running 0 to 30 minutes after the scheduled time. Comment this statement out while debugging.
+year = datetime.datetime.now().year
+month = datetime.datetime.now().month
+day = datetime.datetime.now().day
+
+
+# Collect the links
+try:
+    with open('prl_link_list.txt', 'r', encoding='UTF-8') as f: # If the file exists
+        link_list = f.read().split('\n') # Links already visited in previous runs (as a list)
+except:
+    with open('prl_link_list.txt', 'w', encoding='UTF-8') as f: # If the file does not exist
+        link_list = []
+f = open('prl_link_list.txt', 'a', encoding='UTF-8') # Open the file for appending
+f.write('\nLink list obtained on '+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+':\n')
+match_href = [] # Links that satisfy the conditions in this run
+for loop in range(3):
+    if loop == 0:
+        start_link = "https://journals.aps.org/prl/recent?page=1" # Page 1
+    elif loop == 1:
+        start_link = "https://journals.aps.org/prl/recent?page=2" # Page 2
+    elif loop == 2:
+        start_link = "https://journals.aps.org/prl/recent?page=3" # Page 3 (three pages basically cover all updates of the day)
+    html = urlopen(start_link).read().decode('utf-8') # Open the web page
+    soup = BeautifulSoup(html, features='lxml') # Put the page into soup
+    all_a_tag = soup.find_all('a', href=True) # Get the hyperlink tags
+    for a_tag in all_a_tag:
+        href = a_tag['href'] # Hyperlink string
+        if re.search('/abstract/', href): # An article link
+            if re.search('https://journals.aps.org', href)==None: # If the link is not complete, complete it
+                href = 'https://journals.aps.org'+ href
+            if href not in match_href and href not in link_list and re.search('\?', href)==None: # Avoid duplicate links
+                match_href.append(href)
+                f.write(href+'\n')
+f.close()
+
+
+
+# Collect the abstracts
+try:
+    f = open('prl_all.txt', 'a', encoding='UTF-8') # The full record
+except:
+    f = open('prl_all.txt', 'w', encoding='UTF-8') # If the file does not exist
+try:
+    f_month = open('prl_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'a', encoding='UTF-8') # The record for one month
+except:
+    f_month = open('prl_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'w', encoding='UTF-8') # If the file does not exist
+f.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
+f_month.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
+print('total number=', len(match_href)) # Displayed for debugging
+i00 = 0
+for href in match_href:
+    i00 += 1
+    print('reading number', i00, '...') # Displayed for debugging
+    # time.sleep(random.uniform(10,110)) # Simple disguise for the crawler: rest for about one minute. If there are 60 links, this extends the running time by about 60 minutes. Comment this statement out while debugging.
+    try:
+        html = urlopen(href).read().decode('utf-8') # Open the article link
+        soup = BeautifulSoup(html, features='lxml') # Put the page into soup
+        title = soup.title # Article title
+        f.write(str(title.get_text())+'\n\n')
+        f_month.write(str(title.get_text())+'\n\n')
+        f.write(str(href)+'\n\n') # Article link
+        f_month.write(str(href)+'\n\n')
+        abstract = re.findall('"yes">.*
+            ', word)==None: # Some content matches the filter conditions, so information may be lost.
+                f.write(word+' ')
+                f_month.write(word+' ')
+        f.write('\n\n\n')
+        f_month.write('\n\n\n')
+    except:
+        pass
+f.close()
\ No newline at end of file
diff --git a/language_learning/2021.06.07_find_common_words_on_APS_abstracts/count_words.py b/language_learning/2021.06.07_find_common_words_on_APS_abstracts/count_words.py
new file mode 100644
index 0000000..b3d179b
--- /dev/null
+++ b/language_learning/2021.06.07_find_common_words_on_APS_abstracts/count_words.py
@@ -0,0 +1,41 @@
+"""
+This code is supported by the website: https://www.guanjihuan.com
+The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
+"""
+
+import re
+from collections import Counter
+
+
+def main():
+    file_name = 'prb_all.txt'
+    with open(file_name, 'r', encoding='UTF-8') as f: # Open the file
+        paper_list = f.read().split('\n\n\n') # Split into papers by three newlines
+    word_list = []
+    ignore = ignore_words() # Filter out common words
+    for paper in paper_list:
+        word_list_in_one_paper = []
+        if len(paper)>20: # Filter out the date stamps by string length
+            content_list = paper.split('\n\n') # Split the content by two newlines
+            for content in content_list:
+                if re.search('https://', content)==None: # Filter out the article links
+                    words = content.split(' ') # Split into words by spaces
+                    for word in words:
+                        if word not in word_list_in_one_paper: # Count each word only once per paper
+                            if word not in ignore and len(word)>1: # Filter the words
+                                word_list.append(word)
+                                word_list_in_one_paper.append(word)
+    num = 300
+    most_common_words = Counter(word_list).most_common(num) # Count the num most frequent words
+    print('\nThe', num, 'most frequent words:')
+    for word in most_common_words:
+        print(word)
+
+
+def ignore_words(): # Add or remove as you like
+    ignore = ['Phys.', 'the', 'to', 'of', 'in', 'under', 'and', 'by', 'The', 'at', 'with', 'up', 'be', 'above', 'below', 'are', 'is', 'for', 'that', 'as', 'we', '