This commit is contained in:
guanjihuan 2021-06-11 19:23:32 +08:00
parent 14a297b604
commit da6217459c
5 changed files with 217 additions and 148 deletions

View File

@ -1,89 +0,0 @@
"""
This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
from collections import Counter
import datetime
"""
支持APS系列的首页和recent页面
例如PRB期刊https://journals.aps.org/prb https://journals.aps.org/prb/recent
请勿大量循环运行爬虫防止IP地址被官网拉入黑名单
"""
# 访问链接
visit_link = "https://journals.aps.org/prb"
# visit_link = "https://journals.aps.org/prb/recent"
# visit_link = "https://journals.aps.org/prl"
# visit_link = "https://journals.aps.org/prl/recent"
# 获取文章链接
html = urlopen(visit_link).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
all_a_tag = soup.find_all('a', href=True)
match_href = []
for a_tag in all_a_tag:
href = a_tag['href']
if re.search('https://journals.aps.org/.*/abstract', href) or re.search('.*/abstract/', href):
if href not in match_href and re.search('\?', href)==None:
if re.search('https://journals.aps.org', href)==None:
href = 'https://journals.aps.org'+ href
match_href.append(href)
print('\n当前页面总共有', len(match_href), '篇文章。\n')
# 获取文章中的摘要内容
i0 = 0
year = datetime.datetime.now().year
month = datetime.datetime.now().month
day = datetime.datetime.now().day
f = open(str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'_word_list.txt', 'w', encoding='UTF-8')
ignore_inner = ['alt="Figure', 'the', '<math', 'to', 'of', 'in', 'under', '<i', 'and', 'by', 'The', 'at', 'with', 'up', 'be', 'above', 'below', 'are', 'is', 'for', 'that', 'as', 'we', '<a', 'abstract', 'abstract"','<span', 'which', 'We', 'such', 'has', 'two', 'these', 'it', 'all', 'results', 'result', 'each', 'have', 'between', 'on', 'an', 'can', 'also', 'from', 'Our', 'our', 'using', 'where', 'These', 'out', 'both', 'due', 'less', 'along', 'but', 'In', 'show', 'into', 'study', 'find', 'provide', 'change', '(<math','not', 'open', 'this', 'show', 'into', 'study', 'find', 'provide', 'change', 'present', 'Using', 'large', 'This', 'However', 'appear', 'studied', 'obtain', 'been', 'Both', 'they', 'effects', 'effect', 'compute', 'more', 'does', 'shown', 'Based', 'reveal', 'highly', 'number', 'However,', 'was', 'near', 'full', 'based', 'several', 'suggest', 'agreement', 'predicted', 'values', 'work', 'emphasize', 'without', 'or', 'work,', 'studies', 'future', 'identify', 'present.', 'predict', 'presence', 'their', 'were', 'From', 'its', 'By', 'how', 'ground', 'observed', 'recent', 'For', 'other', 'Here', 'test', 'further', 'Its', 'similar', 'however,', 'range', 'within', 'value', 'possible', 'may', 'than', 'low', 'us', 'obtained', 'around', 'consider', 'about', 'very', 'will', 'when', 'played', 'consist', 'consists', 'Here,', 'observe', 'gives', 'It', 'over', 'cannot', 'As', 'whose', 'new', 'some', 'only', 'from', 'yields', 'shows', 'data', 'direct', 'related', 'different', 'evidence', 'role', 'function', 'origin', 'specific', 'set', 'confirm', 'give', 'Moreover', 'develop', 'including', 'could', 'used', 'means', 'allows', 'make', 'e.g.,', 'provides', 'system', 'systems', 'field', 'fields', 'model', 'model,', 'state', 'states', 'states.', 'state.', 'band', 'bands', 'method', 'methods', 'nature', 'rate', 'zero', 'single', 'theory', 'first', 'one', 'complex', 'approach', 'schemes', 'terms', 'even', 'case', 'analysis', 'weight', 'volume', 'evolution', 'well', 'external', 
'measured', 'introducing', 'dependence', 'properties', 'demonstrate', 'remains', 'through', 'measurements', 'samples', 'findings', 'respect', 'investigate', 'behavior', 'importance', 'considered', 'experimental', 'increase', 'propose', 'follows', 'increase', 'emerged', 'interesting', 'behaviors', 'influenced', 'paramount', 'indicate', 'Rev.', 'concepts', 'induced', 'zone', 'regions', 'exact', 'contribution', 'behavior', 'formation', 'measurements.', 'utilizing', 'constant', 'regime', 'features', 'strength', 'compare', 'determined', 'combination', 'compare', 'determined', 'At', 'inside', 'ambient', 'then', 'important', 'report', 'Moreover,', 'Despite', 'found', 'because', 'process', 'and,', 'significantly', 'realized', 'much', 'natural', 'since', 'grows', 'any', 'compared', 'while', 'forms.', 'appears', 'indicating', 'coefficient', 'suggested', 'time', 'exhibits', 'calculations.', 'developed', 'array', 'discuss', 'field', 'becomes', 'allowing', 'indicates', 'via', 'introduce', 'considering', 'times.', 'constructed', 'explain', 'form', 'owing', 'parameters.', 'parameter', 'operation', 'probe', 'experiments', 'interest', 'strategies', 'seen', 'emerge', 'generic', 'geometry', 'numbers', 'observation', 'avenue', 'theretically', 'three', 'excellent', 'amount', 'notable', 'example', 'being', 'promising', 'latter', 'little', 'imposed', 'put', 'resource', 'together', 'produce', 'successfully','there', 'enhanced', 'this', 'great', 'dirven', 'increasing','should', 'otherwise', 'Further', 'field,', 'known', 'changes', 'still', 'beyond', 'various', 'center', 'previously', 'way', 'peculiar', 'detailed', 'understanding', 'good', 'years', 'where', 'Me', 'origins', 'years.', 'attributed', 'known,', 'them', 'reported', 'no', 'systems', 'agree', 'examined', 'rise', 'calculate', 'those', 'particular', 'relation', 'defined', 'either', 'again', 'current', 'exhibit', 'calculated', 'here', 'made', 'Further', 'consisting', 'constitutes', 'originated', 'if', 'exceed', 'access']
for href in match_href:
i0 += 1
print('正在读取第', i0, '')
html = urlopen(href).read().decode('utf-8')
abstract = re.findall('<a name="abstract">.*<li>Received', html, re.S)[0]
word_list = abstract.split(' ')
word_list_for_one_href = []
for word in word_list:
if 1<len(word)<35 and word not in ignore_inner and re.search('class=', word)==None and re.search('data-', word)==None and re.search('><', word)==None:
if word not in word_list_for_one_href: # 每篇文章的某个词汇只统计一次
word_list_for_one_href.append(word)
f.write(str(word)+' ')
f.close()
"""
运行一次后以上的代码可以注释不需要多次访问网址
以下代码调用的是上面代码生成的txt文件可个性选择忽略的词汇多次运行调试
"""
# 个性选择忽略的词汇(根据需要增删)
ignore = []
year = datetime.datetime.now().year
month = datetime.datetime.now().month
day = datetime.datetime.now().day
with open(str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'_word_list.txt', 'r', encoding='UTF-8') as f:
word_list = f.read().split(' ')
effective_words = []
for word in word_list:
if 1<len(word)<35 and word not in ignore:
effective_words.append(word)
# 统计出现最多的n个词汇
num = 50
most_common_words = Counter(effective_words).most_common(num)
print('\n出现频率最高的前', num, '个词汇:')
for word in most_common_words:
print(word)
print()

View File

@ -1,59 +0,0 @@
"""
This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
from collections import Counter
import datetime
import random
import time
ignore_inner = ['alt="Figure', 'the', '<math', 'to', 'of', 'in', 'under', '<i', 'and', 'by', 'The', 'at', 'with', 'up', 'be', 'above', 'below', 'are', 'is', 'for', 'that', 'as', 'we', '<a', 'abstract', 'abstract"','<span', 'which', 'We', 'such', 'has', 'two', 'these', 'it', 'all', 'results', 'result', 'each', 'have', 'between', 'on', 'an', 'can', 'also', 'from', 'Our', 'our', 'using', 'where', 'These', 'out', 'both', 'due', 'less', 'along', 'but', 'In', 'show', 'into', 'study', 'find', 'provide', 'change', '(<math','not', 'open', 'this', 'show', 'into', 'study', 'find', 'provide', 'change', 'present', 'Using', 'large', 'This', 'However', 'appear', 'studied', 'obtain', 'been', 'Both', 'they', 'effects', 'effect', 'compute', 'more', 'does', 'shown', 'Based', 'reveal', 'highly', 'number', 'However,', 'was', 'near', 'full', 'based', 'several', 'suggest', 'agreement', 'predicted', 'values', 'work', 'emphasize', 'without', 'or', 'work,', 'studies', 'future', 'identify', 'present.', 'predict', 'presence', 'their', 'were', 'From', 'its', 'By', 'how', 'ground', 'observed', 'recent', 'For', 'other', 'Here', 'test', 'further', 'Its', 'similar', 'however,', 'range', 'within', 'value', 'possible', 'may', 'than', 'low', 'us', 'obtained', 'around', 'consider', 'about', 'very', 'will', 'when', 'played', 'consist', 'consists', 'Here,', 'observe', 'gives', 'It', 'over', 'cannot', 'As', 'whose', 'new', 'some', 'only', 'from', 'yields', 'shows', 'data', 'direct', 'related', 'different', 'evidence', 'role', 'function', 'origin', 'specific', 'set', 'confirm', 'give', 'Moreover', 'develop', 'including', 'could', 'used', 'means', 'allows', 'make', 'e.g.,', 'provides', 'system', 'systems', 'field', 'fields', 'model', 'model,', 'state', 'states', 'states.', 'state.', 'band', 'bands', 'method', 'methods', 'nature', 'rate', 'zero', 'single', 'theory', 'first', 'one', 'complex', 'approach', 'schemes', 'terms', 'even', 'case', 'analysis', 'weight', 'volume', 'evolution', 'well', 'external', 
'measured', 'introducing', 'dependence', 'properties', 'demonstrate', 'remains', 'through', 'measurements', 'samples', 'findings', 'respect', 'investigate', 'behavior', 'importance', 'considered', 'experimental', 'increase', 'propose', 'follows', 'increase', 'emerged', 'interesting', 'behaviors', 'influenced', 'paramount', 'indicate', 'Rev.', 'concepts', 'induced', 'zone', 'regions', 'exact', 'contribution', 'behavior', 'formation', 'measurements.', 'utilizing', 'constant', 'regime', 'features', 'strength', 'compare', 'determined', 'combination', 'compare', 'determined', 'At', 'inside', 'ambient', 'then', 'important', 'report', 'Moreover,', 'Despite', 'found', 'because', 'process', 'and,', 'significantly', 'realized', 'much', 'natural', 'since', 'grows', 'any', 'compared', 'while', 'forms.', 'appears', 'indicating', 'coefficient', 'suggested', 'time', 'exhibits', 'calculations.', 'developed', 'array', 'discuss', 'field', 'becomes', 'allowing', 'indicates', 'via', 'introduce', 'considering', 'times.', 'constructed', 'explain', 'form', 'owing', 'parameters.', 'parameter', 'operation', 'probe', 'experiments', 'interest', 'strategies', 'seen', 'emerge', 'generic', 'geometry', 'numbers', 'observation', 'avenue', 'theretically', 'three', 'excellent', 'amount', 'notable', 'example', 'being', 'promising', 'latter', 'little', 'imposed', 'put', 'resource', 'together', 'produce', 'successfully','there', 'enhanced', 'this', 'great', 'dirven', 'increasing','should', 'otherwise', 'Further', 'field,', 'known', 'changes', 'still', 'beyond', 'various', 'center', 'previously', 'way', 'peculiar', 'detailed', 'understanding', 'good', 'years', 'where', 'Me', 'origins', 'years.', 'attributed', 'known,', 'them', 'reported', 'no', 'systems', 'agree', 'examined', 'rise', 'calculate', 'those', 'particular', 'relation', 'defined', 'either', 'again', 'current', 'exhibit', 'calculated', 'here', 'made', 'Further', 'consisting', 'constitutes', 'originated', 'if', 'exceed', 'access']
num = 50
year = datetime.datetime.now().year
month = datetime.datetime.now().month
day = datetime.datetime.now().day
for loop in range(2):
if loop == 0:
visit_link = "https://journals.aps.org/prb/recent"
with open('prb_recent_most_common_words.txt', 'r', encoding='UTF-8') as f0:
content_before = f0.read()
f = open('prb_recent_most_common_words.txt', 'w', encoding='UTF-8')
elif loop == 1:
visit_link = "https://journals.aps.org/prl/recent"
with open('prl_recent_most_common_words.txt', 'r', encoding='UTF-8') as f0:
content_before = f0.read()
f = open('prl_recent_most_common_words.txt', 'w', encoding='UTF-8')
html = urlopen(visit_link).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
all_a_tag = soup.find_all('a', href=True)
match_href = []
for a_tag in all_a_tag:
href = a_tag['href']
if re.search('https://journals.aps.org/.*/abstract', href) or re.search('.*/abstract/', href):
if href not in match_href and re.search('\?', href)==None:
if re.search('https://journals.aps.org', href)==None:
href = 'https://journals.aps.org'+ href
match_href.append(href)
all_word_list = []
for href in match_href:
time.sleep(random.uniform(0,2)) # 爬虫休息一秒左右,简单伪装
html = urlopen(href).read().decode('utf-8')
abstract = re.findall('<a name="abstract">.*<li>Received', html, re.S)[0]
word_list = abstract.split(' ')
word_list_for_one_href = []
for word in word_list:
if 1<len(word)<35 and word not in ignore_inner and re.search('class=', word)==None and re.search('data-', word)==None and re.search('<', word)==None and re.search('>', word)==None and re.search('href', word)==None:
if word not in word_list_for_one_href:
word_list_for_one_href.append(word)
all_word_list.append(word)
most_common_words = Counter(all_word_list).most_common(num)
f.write(str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'|number_of_papers='+str(len(match_href)))
for word in most_common_words:
f.write('|'+str(word))
f.write('\n\n')
f.write(content_before)
f.close()

View File

@ -0,0 +1,88 @@
"""
This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
from collections import Counter
import datetime
import random
import time
# time.sleep(random.uniform(0,1800)) # 爬虫简单伪装在固定时间后0到30分钟后开始运行。调试的时候把该语句注释。
year = datetime.datetime.now().year
month = datetime.datetime.now().month
day = datetime.datetime.now().day
# 获取链接
try:
with open('prb_link_list.txt', 'r', encoding='UTF-8') as f: # 如果文件存在
link_list = f.read().split('\n') # 历史已经访问过的链接(数组类型)
except:
with open('prb_link_list.txt', 'w', encoding='UTF-8') as f: # 如果文件不存在
link_list = []
f = open('prb_link_list.txt', 'a', encoding='UTF-8') # 打开文件(补充)
f.write('\nLink list obtained on '+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+':\n')
match_href = [] # 在本次运行中满足条件的链接
for loop in range(3):
if loop == 0:
start_link = "https://journals.aps.org/prb/recent?page=1" # 看第一页
elif loop == 1:
start_link = "https://journals.aps.org/prb/recent?page=2" # 看第二页
elif loop == 2:
start_link = "https://journals.aps.org/prb/recent?page=3" # 看第三页(三页基本上覆盖了当天的所有更新)
html = urlopen(start_link).read().decode('utf-8') # 打开网页
soup = BeautifulSoup(html, features='lxml') # 放入soup中
all_a_tag = soup.find_all('a', href=True) # 获取超链接标签
for a_tag in all_a_tag:
href = a_tag['href'] # 超链接字符串
if re.search('/abstract/', href): # 文章的链接
if re.search('https://journals.aps.org', href)==None: # 如果链接不是完整的,那么补充完整
href = 'https://journals.aps.org'+ href
if href not in match_href and href not in link_list and re.search('\?', href)==None: # 链接不重复
match_href.append(href)
f.write(href+'\n')
f.close()
# 获取摘要
try:
f = open('prb_all.txt', 'a', encoding='UTF-8') # 全部记录
except:
f = open('prb_all.txt', 'w', encoding='UTF-8') # 如果文件不存在
try:
f_month = open('prb_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'a', encoding='UTF-8') # 一个月的记录
except:
f_month = open('prb_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'w', encoding='UTF-8') # 如果文件不存在
f.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
f_month.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
print('total number=', len(match_href)) # 调试的时候显示这个
i00 = 0
for href in match_href:
i00 += 1
print('reading number', i00, '...') # 调试的时候显示这个
# time.sleep(random.uniform(10,110)) # 爬虫简单伪装休息一分钟左右。如果链接个数有60个那么程序运行时间延长60分钟。调试的时候把该语句注释。
try:
html = urlopen(href).read().decode('utf-8') # 打开文章链接
soup = BeautifulSoup(html, features='lxml') # 放入soup中
title = soup.title # 文章标题
f.write(str(title.get_text())+'\n\n')
f_month.write(str(title.get_text())+'\n\n')
f.write(str(href)+'\n\n') # 文章链接
f_month.write(str(href)+'\n\n')
abstract = re.findall('"yes"><p>.*</p><div', html, re.S)[0][9:-8] # 文章摘要
word_list = abstract.split(' ') # 划分单词
for word in word_list:
if re.search('<', word)==None and re.search('>', word)==None: # 有些内容满足过滤条件,因此信息可能会丢失。
f.write(word+' ')
f_month.write(word+' ')
f.write('\n\n\n')
f_month.write('\n\n\n')
except:
pass
f.close()

View File

@ -0,0 +1,88 @@
"""
This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
from collections import Counter
import datetime
import random
import time
# time.sleep(random.uniform(0,1800)) # 爬虫简单伪装在固定时间后0到30分钟后开始运行。调试的时候把该语句注释。
year = datetime.datetime.now().year
month = datetime.datetime.now().month
day = datetime.datetime.now().day
# 获取链接
try:
with open('prl_link_list.txt', 'r', encoding='UTF-8') as f: # 如果文件存在
link_list = f.read().split('\n') # 历史已经访问过的链接(数组类型)
except:
with open('prl_link_list.txt', 'w', encoding='UTF-8') as f: # 如果文件不存在
link_list = []
f = open('prl_link_list.txt', 'a', encoding='UTF-8') # 打开文件(补充)
f.write('\nLink list obtained on '+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+':\n')
match_href = [] # 在本次运行中满足条件的链接
for loop in range(3):
if loop == 0:
start_link = "https://journals.aps.org/prl/recent?page=1" # 看第一页
elif loop == 1:
start_link = "https://journals.aps.org/prl/recent?page=2" # 看第二页
elif loop == 2:
start_link = "https://journals.aps.org/prl/recent?page=3" # 看第三页(三页基本上覆盖了当天的所有更新)
html = urlopen(start_link).read().decode('utf-8') # 打开网页
soup = BeautifulSoup(html, features='lxml') # 放入soup中
all_a_tag = soup.find_all('a', href=True) # 获取超链接标签
for a_tag in all_a_tag:
href = a_tag['href'] # 超链接字符串
if re.search('/abstract/', href): # 文章的链接
if re.search('https://journals.aps.org', href)==None: # 如果链接不是完整的,那么补充完整
href = 'https://journals.aps.org'+ href
if href not in match_href and href not in link_list and re.search('\?', href)==None: # 链接不重复
match_href.append(href)
f.write(href+'\n')
f.close()
# 获取摘要
try:
f = open('prl_all.txt', 'a', encoding='UTF-8') # 全部记录
except:
f = open('prl_all.txt', 'w', encoding='UTF-8') # 如果文件不存在
try:
f_month = open('prl_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'a', encoding='UTF-8') # 一个月的记录
except:
f_month = open('prl_'+str(year)+'.'+str(month).rjust(2,'0')+'.txt', 'w', encoding='UTF-8') # 如果文件不存在
f.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
f_month.write('\n\n['+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+'][total number='+str(len(match_href))+']\n\n\n')
print('total number=', len(match_href)) # 调试的时候显示这个
i00 = 0
for href in match_href:
i00 += 1
print('reading number', i00, '...') # 调试的时候显示这个
# time.sleep(random.uniform(10,110)) # 爬虫简单伪装休息一分钟左右。如果链接个数有60个那么程序运行时间延长60分钟。调试的时候把该语句注释。
try:
html = urlopen(href).read().decode('utf-8') # 打开文章链接
soup = BeautifulSoup(html, features='lxml') # 放入soup中
title = soup.title # 文章标题
f.write(str(title.get_text())+'\n\n')
f_month.write(str(title.get_text())+'\n\n')
f.write(str(href)+'\n\n') # 文章链接
f_month.write(str(href)+'\n\n')
abstract = re.findall('"yes"><p>.*</p><div', html, re.S)[0][9:-8] # 文章摘要
word_list = abstract.split(' ') # 划分单词
for word in word_list:
if re.search('<', word)==None and re.search('>', word)==None: # 有些内容满足过滤条件,因此信息可能会丢失。
f.write(word+' ')
f_month.write(word+' ')
f.write('\n\n\n')
f_month.write('\n\n\n')
except:
pass
f.close()

View File

@ -0,0 +1,41 @@
"""
This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/13623
"""
import re
from collections import Counter
def main():
    """Count and print the most frequent words over all stored abstracts.

    Reads 'prb_all.txt' (written by the daily crawler), splits it into
    papers, filters out links, dates and common words, counts each word at
    most once per paper, and prints the top-300 words with their counts.
    """
    file_name = 'prb_all.txt'
    with open(file_name, 'r', encoding='UTF-8') as f:
        paper_list = f.read().split('\n\n\n')  # three newlines separate papers
    word_list = []
    # Convert the ignore list to a set once: O(1) membership tests instead
    # of scanning a ~400-entry list for every word.
    ignore = set(ignore_words())
    for paper in paper_list:
        word_list_in_one_paper = []
        if len(paper) > 20:  # filter out the short date/header records
            content_list = paper.split('\n\n')  # two newlines separate title/link/abstract
            for content in content_list:
                if re.search('https://', content) is None:  # skip the article link
                    words = content.split(' ')
                    for word in words:
                        if word not in word_list_in_one_paper:  # count each word once per paper
                            if word not in ignore and len(word) > 1:  # filter common/short words
                                word_list.append(word)
                                word_list_in_one_paper.append(word)
    num = 300
    most_common_words = Counter(word_list).most_common(num)  # top-num words
    print('\n出现频率最高的前', num, '个词汇:')
    for word in most_common_words:
        print(word)
def ignore_words():  # add/remove entries as needed
    """Return the list of common words to ignore when counting frequencies.

    The hand-maintained literal contains duplicate entries (e.g. 'show',
    'into', 'study', 'compare', 'Further' appear twice); they are removed
    while preserving the original order, so membership semantics are
    unchanged and the returned list is duplicate-free.
    """
    ignore = ['Phys.', 'the', 'to', 'of', 'in', 'under', 'and', 'by', 'The', 'at', 'with', 'up', 'be', 'above', 'below', 'are', 'is', 'for', 'that', 'as', 'we', '<a', 'abstract', 'abstract"','<span', 'which', 'We', 'such', 'has', 'two', 'these', 'it', 'all', 'results', 'result', 'each', 'have', 'between', 'on', 'an', 'can', 'also', 'from', 'Our', 'our', 'using', 'where', 'These', 'out', 'both', 'due', 'less', 'along', 'but', 'In', 'show', 'into', 'study', 'find', 'provide', 'change','not', 'open', 'this', 'show', 'into', 'study', 'find', 'provide', 'change', 'present', 'Using', 'large', 'This', 'However', 'appear', 'studied', 'obtain', 'been', 'Both', 'they', 'effects', 'effect', 'compute', 'more', 'does', 'shown', 'Based', 'reveal', 'highly', 'number', 'However,', 'was', 'near', 'full', 'based', 'several', 'suggest', 'agreement', 'predicted', 'values', 'work', 'emphasize', 'without', 'or', 'work,', 'studies', 'future', 'identify', 'present.', 'predict', 'presence', 'their', 'were', 'From', 'its', 'By', 'how', 'ground', 'observed', 'recent', 'For', 'other', 'Here', 'test', 'further', 'Its', 'similar', 'however,', 'range', 'within', 'value', 'possible', 'may', 'than', 'low', 'us', 'obtained', 'around', 'consider', 'about', 'very', 'will', 'when', 'played', 'consist', 'consists', 'Here,', 'observe', 'gives', 'It', 'over', 'cannot', 'As', 'whose', 'new', 'some', 'only', 'from', 'yields', 'shows', 'data', 'direct', 'related', 'different', 'evidence', 'role', 'function', 'origin', 'specific', 'set', 'confirm', 'give', 'Moreover', 'develop', 'including', 'could', 'used', 'means', 'allows', 'make', 'e.g.,', 'provides', 'system', 'systems', 'field', 'fields', 'model', 'model,', 'state', 'states', 'states.', 'state.', 'band', 'bands', 'method', 'methods', 'nature', 'rate', 'zero', 'single', 'theory', 'first', 'one', 'complex', 'approach', 'schemes', 'terms', 'even', 'case', 'analysis', 'weight', 'volume', 'evolution', 'well', 'external', 'measured', 'introducing', 'dependence', 
    'properties', 'demonstrate', 'remains', 'through', 'measurements', 'samples', 'findings', 'respect', 'investigate', 'behavior', 'importance', 'considered', 'experimental', 'increase', 'propose', 'follows', 'increase', 'emerged', 'interesting', 'behaviors', 'influenced', 'paramount', 'indicate', 'Rev.', 'concepts', 'induced', 'zone', 'regions', 'exact', 'contribution', 'behavior', 'formation', 'measurements.', 'utilizing', 'constant', 'regime', 'features', 'strength', 'compare', 'determined', 'combination', 'compare', 'determined', 'At', 'inside', 'ambient', 'then', 'important', 'report', 'Moreover,', 'Despite', 'found', 'because', 'process', 'and,', 'significantly', 'realized', 'much', 'natural', 'since', 'grows', 'any', 'compared', 'while', 'forms.', 'appears', 'indicating', 'coefficient', 'suggested', 'time', 'exhibits', 'calculations.', 'developed', 'array', 'discuss', 'field', 'becomes', 'allowing', 'indicates', 'via', 'introduce', 'considering', 'times.', 'constructed', 'explain', 'form', 'owing', 'parameters.', 'parameter', 'operation', 'probe', 'experiments', 'interest', 'strategies', 'seen', 'emerge', 'generic', 'geometry', 'numbers', 'observation', 'avenue', 'theretically', 'three', 'excellent', 'amount', 'notable', 'example', 'being', 'promising', 'latter', 'little', 'imposed', 'put', 'resource', 'together', 'produce', 'successfully','there', 'enhanced', 'this', 'great', 'dirven', 'increasing','should', 'otherwise', 'Further', 'field,', 'known', 'changes', 'still', 'beyond', 'various', 'center', 'previously', 'way', 'peculiar', 'detailed', 'understanding', 'good', 'years', 'where', 'Me', 'origins', 'years.', 'attributed', 'known,', 'them', 'reported', 'no', 'systems', 'agree', 'examined', 'rise', 'calculate', 'those', 'particular', 'relation', 'defined', 'either', 'again', 'current', 'exhibit', 'calculated', 'here', 'made', 'Further', 'consisting', 'constitutes', 'originated', 'if', 'exceed', 'access']
    # dict.fromkeys keeps first-occurrence order and drops duplicates.
    return list(dict.fromkeys(ignore))
# Entry point: run the word-frequency analysis only when executed as a script.
if __name__ == '__main__':
    main()