update
This commit is contained in:
		
							
								
								
									
										38
									
								
								language_learning/2021.11.17_zhihu/nature_physics.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								language_learning/2021.11.17_zhihu/nature_physics.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
"""Scrape the latest Nature Physics research articles into an HTML page.

Downloads the journal's recent-articles listing, extracts every article
link and its publication date, and writes them as clickable entries into
``nature_physics.html``.
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import datetime


# Take a single timestamp so year/month/day cannot straddle midnight
# (the original called datetime.now() three separate times).
now = datetime.datetime.now()
year, month, day = now.year, now.month, now.day

# 'with' guarantees the output file is closed even if scraping raises.
with open('nature_physics.html', 'w', encoding='UTF-8') as f:
    f.write('<meta charset="utf-8"><style type="text/css">a{text-decoration: none;color: #0a5794;}a:hover {text-decoration: underline;color: red; }</style>')
    f.write('<p>'+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+' 已更新</p>')

    match_href = []  # URLs already emitted, for de-duplication
    start_link = "https://www.nature.com/nphys/research-articles"
    html = urlopen(start_link).read().decode('utf-8')  # fetch the listing page
    soup = BeautifulSoup(html, features='lxml')
    all_article = soup.find_all('article', {"class": "u-full-height c-card c-card--flush"})
    for article in all_article:
        all_a_tag = article.find_all('a', href=True)  # every hyperlink in this card
        for a_tag in all_a_tag:
            href = a_tag['href']
            if re.search('/articles/', href):  # keep only links to articles
                if re.search('https://www.nature.com', href) is None:
                    # Relative link: make it absolute.
                    href = 'https://www.nature.com' + href
                # Skip duplicates and links carrying a query string
                # (raw string: '\?' is an invalid escape on modern Python).
                if href not in match_href and re.search(r'\?', href) is None:
                    match_href.append(href)
                    f.write('<p><a target="_blank" href="')
                    f.write(href)  # article URL
                    f.write('">')
                    f.write(a_tag.get_text())
                    f.write('</a>  ')
        # Publication date shown next to the links for this card.
        time = article.find('time', {"class": "c-meta__item c-meta__item--block-at-lg"}).get_text()
        f.write(time + '</p>')
|  | ||||
							
								
								
									
										36
									
								
								language_learning/2021.11.17_zhihu/physics_magazine.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								language_learning/2021.11.17_zhihu/physics_magazine.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,36 @@ | ||||
"""Scrape the latest APS Physics Magazine features into an HTML page.

Downloads the https://physics.aps.org/ front page, extracts every
article link and its date, and writes them as clickable entries into
``physics_magazine.html``.
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import datetime


# Single timestamp so year/month/day cannot straddle midnight
# (the original called datetime.now() three separate times).
now = datetime.datetime.now()
year, month, day = now.year, now.month, now.day

# 'with' guarantees the output file is closed even if scraping raises.
with open('physics_magazine.html', 'w', encoding='UTF-8') as f:
    f.write('<meta charset="utf-8"><style type="text/css">a{text-decoration: none;color: #0a5794;}a:hover {text-decoration: underline;color: red; }</style>')
    f.write('<p>'+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+' 已更新</p>')

    match_href = []  # URLs already emitted, for de-duplication
    start_link = "https://physics.aps.org/"
    html = urlopen(start_link).read().decode('utf-8')  # fetch the front page
    soup = BeautifulSoup(html, features='lxml')
    all_articles = soup.find_all('div', {"class": "feed-item-details"})
    for article in all_articles:
        all_a_tag = article.find_all('a', href=True)  # every hyperlink in this item
        for a_tag in all_a_tag:
            href = a_tag['href']
            if re.search('/articles/', href):  # keep only links to articles
                if re.search('https://physics.aps.org', href) is None:
                    # Relative link: make it absolute.
                    href = 'https://physics.aps.org' + href
                if href not in match_href:  # skip duplicates
                    match_href.append(href)
                    f.write('<p><a target="_blank" href="')
                    f.write(href)  # article URL
                    f.write('">')
                    f.write(a_tag.get_text())
                    f.write('</a>  ')
        # Publication date shown next to the links for this item.
        time = article.find('time', {"class": "feed-item-date"}).get_text()
        f.write(time + '</p>')
							
								
								
									
										42
									
								
								language_learning/2021.11.17_zhihu/prb.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								language_learning/2021.11.17_zhihu/prb.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,42 @@ | ||||
"""Scrape recent Physical Review B articles into an HTML page.

Downloads the PRB "recent" listing (first page; a second page is left
commented out), extracts each article's abstract link and publication
date, and writes them as clickable entries into ``prb.html``.
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import datetime


# Single timestamp so year/month/day cannot straddle midnight
# (the original called datetime.now() three separate times).
now = datetime.datetime.now()
year, month, day = now.year, now.month, now.day

# 'with' guarantees the output file is closed even if scraping raises.
with open('prb.html', 'w', encoding='UTF-8') as f:
    f.write('<meta charset="utf-8"><style type="text/css">a{text-decoration: none;color: #0a5794;}a:hover {text-decoration: underline;color: red; }</style>')
    f.write('<p>'+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+' 已更新</p>')

    match_href = []  # URLs already emitted, for de-duplication
    for loop in range(1):
        if loop == 0:
            start_link = "https://journals.aps.org/prb/recent"  # page 1
        # elif loop == 1:
        #     start_link = "https://journals.aps.org/prb/recent?page=2"  # page 2
        html = urlopen(start_link).read().decode('utf-8')  # fetch the listing page
        soup = BeautifulSoup(html, features='lxml')
        all_article = soup.find_all('div', {"class": "article panel article-result"})
        for article in all_article:
            all_a_tag = article.find_all('a', href=True)  # every hyperlink in this result
            for a_tag in all_a_tag:
                href = a_tag['href']
                if re.search('/abstract/', href):  # keep only abstract links
                    if re.search('https://journals.aps.org', href) is None:
                        # Relative link: make it absolute.
                        href = 'https://journals.aps.org' + href
                    # Skip duplicates and links carrying a query string
                    # (raw string: '\?' is an invalid escape on modern Python).
                    if href not in match_href and re.search(r'\?', href) is None:
                        match_href.append(href)
                        f.write('<p><a target="_blank" href="')
                        f.write(href)  # article URL
                        f.write('">')
                        f.write(a_tag.get_text())
                        f.write('</a>  ')
            # "– Published <date> …": the prefix "– Published " is 12 chars
            # ('–' counts as one), so [12:] keeps just the date text.
            info = article.find('h6', {"class": "pub-info"}).get_text()
            f.write(re.findall('– Published .*', info, re.S)[0][12:] + '</p>')
|  | ||||
							
								
								
									
										42
									
								
								language_learning/2021.11.17_zhihu/prl.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								language_learning/2021.11.17_zhihu/prl.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,42 @@ | ||||
"""Scrape recent Physical Review Letters articles into an HTML page.

Downloads the PRL "recent" listing (first page; a second page is left
commented out), extracts each article's abstract link and publication
date, and writes them as clickable entries into ``prl.html``.
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import datetime


# Single timestamp so year/month/day cannot straddle midnight
# (the original called datetime.now() three separate times).
now = datetime.datetime.now()
year, month, day = now.year, now.month, now.day

# 'with' guarantees the output file is closed even if scraping raises.
with open('prl.html', 'w', encoding='UTF-8') as f:
    f.write('<meta charset="utf-8"><style type="text/css">a{text-decoration: none;color: #0a5794;}a:hover {text-decoration: underline;color: red; }</style>')
    f.write('<p>'+str(year)+'.'+str(month).rjust(2,'0')+'.'+str(day).rjust(2,'0')+' 已更新</p>')

    match_href = []  # URLs already emitted, for de-duplication
    for loop in range(1):
        if loop == 0:
            start_link = "https://journals.aps.org/prl/recent"  # page 1
        # elif loop == 1:
        #     start_link = "https://journals.aps.org/prl/recent?page=2"  # page 2
        html = urlopen(start_link).read().decode('utf-8')  # fetch the listing page
        soup = BeautifulSoup(html, features='lxml')
        all_article = soup.find_all('div', {"class": "article panel article-result"})
        for article in all_article:
            all_a_tag = article.find_all('a', href=True)  # every hyperlink in this result
            for a_tag in all_a_tag:
                href = a_tag['href']
                if re.search('/abstract/', href):  # keep only abstract links
                    if re.search('https://journals.aps.org', href) is None:
                        # Relative link: make it absolute.
                        href = 'https://journals.aps.org' + href
                    # Skip duplicates and links carrying a query string
                    # (raw string: '\?' is an invalid escape on modern Python).
                    if href not in match_href and re.search(r'\?', href) is None:
                        match_href.append(href)
                        f.write('<p><a target="_blank" href="')
                        f.write(href)  # article URL
                        f.write('">')
                        f.write(a_tag.get_text())
                        f.write('</a>  ')
            # Pattern made consistent with prb.py (space before '.*'): the
            # prefix "– Published " is 12 chars ('–' counts as one), so
            # [12:] keeps just the date text.
            info = article.find('h6', {"class": "pub-info"}).get_text()
            f.write(re.findall('– Published .*', info, re.S)[0][12:] + '</p>')
|  | ||||
		Reference in New Issue
	
	Block a user