From b115f2fb1d54ec6f4920ef7a42c999e04cb58070 Mon Sep 17 00:00:00 2001
From: guanjihuan <34735497+guanjihuan@users.noreply.github.com>
Date: Mon, 9 Aug 2021 12:45:46 +0800
Subject: [PATCH] update

---
 .../BeautifulSoup.py                          |  92 +++++------
 .../download_from_sci_hub_with_python.py      |  45 ------
 ...ad_references_in_a_pdf_file_with_python.py | 152 +++++++++---------
 .../get_links_from_a_pdf_file.py              |  58 +++----
 4 files changed, 151 insertions(+), 196 deletions(-)
 rename language_learning/{2020.10.17_0_web_scraping_with_BeautifulSoup => 2020.10.17_web_scraping_with_BeautifulSoup}/BeautifulSoup.py (97%) mode change 100755 => 100644
 delete mode 100755 language_learning/2020.10.31_0_download_from_sci_hub_with_python/download_from_sci_hub_with_python.py
 rename language_learning/{2020.10.31_1_download_references_in_a_pdf_file_with_python => 2020.10.31_download_references_in_a_pdf_file_with_python}/download_references_in_a_pdf_file_with_python.py (97%) mode change 100755 => 100644
 rename language_learning/{2020.10.31_1_download_references_in_a_pdf_file_with_python => 2020.10.31_download_references_in_a_pdf_file_with_python}/get_links_from_a_pdf_file.py (97%) mode change 100755 => 100644

diff --git a/language_learning/2020.10.17_0_web_scraping_with_BeautifulSoup/BeautifulSoup.py b/language_learning/2020.10.17_web_scraping_with_BeautifulSoup/BeautifulSoup.py
old mode 100755
new mode 100644
similarity index 97%
rename from language_learning/2020.10.17_0_web_scraping_with_BeautifulSoup/BeautifulSoup.py
rename to language_learning/2020.10.17_web_scraping_with_BeautifulSoup/BeautifulSoup.py
index be06dca..9e74a88
--- a/language_learning/2020.10.17_0_web_scraping_with_BeautifulSoup/BeautifulSoup.py
+++ b/language_learning/2020.10.17_web_scraping_with_BeautifulSoup/BeautifulSoup.py
@@ -1,47 +1,47 @@
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-
-# The simplest case
-html = urlopen("https://mofanpy.com/static/scraping/basic-structure.html").read().decode('utf-8')
-print('\nPage source 1:\n\n----------------begin----------------\n', html, '\n\n----------------end----------------')  # show the page source
-
-soup = BeautifulSoup(html, features='lxml')  # parse the page with BeautifulSoup
-print('\nContent of the h1 tag via soup.h1:\n', soup.h1)
-print('\nContent of the first p tag via soup.p:\n', soup.p)
-print('\nContent of the first a tag via soup.a:\n', soup.a)
-
-all_href = soup.find_all('a')
-print("\nAll a tags via soup.find_all('a'):\n", all_href)
-
-print('\nReading a tag attribute, method 1:')
-for a in all_href:
-    print(a)
-    print(a['href'])
-
-all_href = [a['href'] for a in all_href]
-print('\nReading a tag attribute, method 2:\n', all_href, '\n')
-
-
-
-
-# A page that uses CSS classes
-html = urlopen("https://mofanpy.com/static/scraping/list.html").read().decode('utf-8')
-print('\nPage source 2:\n\n----------------begin----------------\n', html, '\n\n----------------end----------------')  # show the page source
-
-soup = BeautifulSoup(html, features='lxml')  # parse the page with BeautifulSoup
-
-print('\nFilter elements by class:')
-month = soup.find_all('li', {"class": "month"})
-print(month, '\n')
-
-print('Text only:')
-for m in month:
-    print(m.get_text())
-
-print('\nNested filtering:')
-january = soup.find('ul', {"class": 'jan'})
-print(january, '\n')
-d_january = january.find_all('li')  # use january as the parent
-print(d_january, '\n')
-for d in d_january:
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+
+# The simplest case
+html = urlopen("https://mofanpy.com/static/scraping/basic-structure.html").read().decode('utf-8')
+print('\nPage source 1:\n\n----------------begin----------------\n', html, '\n\n----------------end----------------')  # show the page source
+
+soup = BeautifulSoup(html, features='lxml')  # parse the page with BeautifulSoup
+print('\nContent of the h1 tag via soup.h1:\n', soup.h1)
+print('\nContent of the first p tag via soup.p:\n', soup.p)
+print('\nContent of the first a tag via soup.a:\n', soup.a)
+
+all_href = soup.find_all('a')
+print("\nAll a tags via soup.find_all('a'):\n", all_href)
+
+print('\nReading a tag attribute, method 1:')
+for a in all_href:
+    print(a)
+    print(a['href'])
+
+all_href = [a['href'] for a in all_href]
+print('\nReading a tag attribute, method 2:\n', all_href, '\n')
+
+
+
+
+# A page that uses CSS classes
+html = urlopen("https://mofanpy.com/static/scraping/list.html").read().decode('utf-8')
+print('\nPage source 2:\n\n----------------begin----------------\n', html, '\n\n----------------end----------------')  # show the page source
+
+soup = BeautifulSoup(html, features='lxml')  # parse the page with BeautifulSoup
+
+print('\nFilter elements by class:')
+month = soup.find_all('li', {"class": "month"})
+print(month, '\n')
+
+print('Text only:')
+for m in month:
+    print(m.get_text())
+
+print('\nNested filtering:')
+january = soup.find('ul', {"class": 'jan'})
+print(january, '\n')
+d_january = january.find_all('li')  # use january as the parent
+print(d_january, '\n')
+for d in d_january:
     print(d.get_text())
\ No newline at end of file
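
A side note on the chained find/find_all at the end of this script: the same two-step filtering can be written as a single CSS selector via soup.select. A minimal sketch (not part of the patch), assuming the same list.html structure used above:

    # Sketch: CSS-selector equivalent of soup.find('ul', {"class": 'jan'}).find_all('li')
    from bs4 import BeautifulSoup
    from urllib.request import urlopen

    html = urlopen("https://mofanpy.com/static/scraping/list.html").read().decode('utf-8')
    soup = BeautifulSoup(html, features='lxml')
    for li in soup.select('ul.jan li'):  # descendant selector: li elements inside ul.jan
        print(li.get_text())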
diff --git a/language_learning/2020.10.31_0_download_from_sci_hub_with_python/download_from_sci_hub_with_python.py b/language_learning/2020.10.31_0_download_from_sci_hub_with_python/download_from_sci_hub_with_python.py
deleted file mode 100755
index 8c0e303..0000000
--- a/language_learning/2020.10.31_0_download_from_sci_hub_with_python/download_from_sci_hub_with_python.py
+++ /dev/null
@@ -1,45 +0,0 @@
-"""
-This code is supported by the website: https://www.guanjihuan.com
-The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6846
-"""
-
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-import re  # regular expressions
-import requests
-import os
-# os.chdir('D:')  # directory where the downloads are saved
-
-
-# Input
-address_array = []
-for i in range(10):  # download at most 10 papers per run
-    address = input('\nEnter a DOI/URL/title: ')
-    address_array.append(address)
-    continue_or_not = input('\nAdd another (1) / stop (0): ')
-    if int(continue_or_not) == 0:
-        break
-
-# Download
-for address in address_array:
-    r = requests.post('https://sci-hub.st/', data={'request': address})
-    print('\nResponse:', r)
-    print('URL visited:', r.url)
-    soup = BeautifulSoup(r.text, features='lxml')
-    pdf_URL = soup.iframe['src']
-    if re.search(re.compile('^https:'), pdf_URL):
-        pass
-    else:
-        pdf_URL = 'https:'+pdf_URL
-    print('PDF URL:', pdf_URL)
-    name = re.search(re.compile('fdp.*?/'), pdf_URL[::-1]).group()[::-1][1:]
-    print('PDF file name:', name)
-    print('Saved in:', os.getcwd())
-    print('\nDownloading...')
-    r = requests.get(pdf_URL, stream=True)
-    with open(name, 'wb') as f:
-        for chunk in r.iter_content(chunk_size=32):
-            f.write(chunk)
-    print('Download finished!')
-
-print('\nAll downloads finished!')
\ No newline at end of file
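
The trickiest line in this deleted script is the file-name extraction: it reverses the URL, matches 'fdp' ('pdf' backwards) lazily up to the first '/', reverses the match back, and drops the leading slash, leaving the last path segment that ends in '.pdf'. A more readable equivalent, sketched with a hypothetical URL and assuming no query string follows '.pdf':

    # Sketch: plain-string equivalent of the reversed-regex extraction above.
    pdf_URL = 'https://example.org/papers/article.pdf'  # hypothetical URL
    name = pdf_URL.rsplit('/', 1)[-1]  # last path segment -> 'article.pdf'
    print(name)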
diff --git a/language_learning/2020.10.31_1_download_references_in_a_pdf_file_with_python/download_references_in_a_pdf_file_with_python.py b/language_learning/2020.10.31_download_references_in_a_pdf_file_with_python/download_references_in_a_pdf_file_with_python.py
old mode 100755
new mode 100644
similarity index 97%
rename from language_learning/2020.10.31_1_download_references_in_a_pdf_file_with_python/download_references_in_a_pdf_file_with_python.py
rename to language_learning/2020.10.31_download_references_in_a_pdf_file_with_python/download_references_in_a_pdf_file_with_python.py
index a0f280e..8317d04
--- a/language_learning/2020.10.31_1_download_references_in_a_pdf_file_with_python/download_references_in_a_pdf_file_with_python.py
+++ b/language_learning/2020.10.31_download_references_in_a_pdf_file_with_python/download_references_in_a_pdf_file_with_python.py
@@ -1,77 +1,77 @@
-"""
-This code is supported by the website: https://www.guanjihuan.com
-The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869
-"""
-
-import PyPDF2
-import os
-import re
-from bs4 import BeautifulSoup
-from urllib.request import urlopen
-import requests
-
-
-
-def main():
-    os.chdir('D:/')  # directory containing the PDF file
-    filename = input('Enter the PDF file name: ')
-    pdfFile = open(filename+'.pdf', 'rb')  # open the PDF file
-    links = all_links_in_pdf(pdfFile)  # collect the links in the PDF
-    pdfFile.close()  # close the PDF file
-    os.chdir('D:/Reference')  # directory where the references are saved
-    download(links)  # download the references
-
-
-
-def all_links_in_pdf(pdfFile):
-    pdfReader = PyPDF2.PdfFileReader(pdfFile)
-    pages = pdfReader.getNumPages()
-    i0 = 0
-    links = []
-    print()
-    for page in range(pages):
-        pageSliced = pdfReader.getPage(page)
-        pageObject = pageSliced.getObject()
-        if '/Annots' in pageObject.keys():
-            ann = pageObject['/Annots']
-            old = ''
-            for a in ann:
-                u = a.getObject()
-                if '/A' in u.keys():
-                    if re.search(re.compile('^https://doi.org'), u['/A']['/URI']):  # keep only DOI links
-                        if u['/A']['/URI'] != old:  # skip repeated links
-                            print(i0, u['/A']['/URI'])
-                            links.append(u['/A']['/URI'])  # store the link in the links array
-                            i0 += 1
-                            old = u['/A']['/URI']
-    return links
-
-
-
-def download(links):
-    for i0 in [0, 1, 3]:  # download selected references; to download all, use: for i0 in range(len(links)):
-        address = links[i0]
-        r = requests.post('https://sci-hub.st/', data={'request': address})
-        print('\nResponse:', r)
-        print('URL visited:', r.url)
-        soup = BeautifulSoup(r.text, features='lxml')
-        pdf_URL = soup.iframe['src']
-        if re.search(re.compile('^https:'), pdf_URL):
-            pass
-        else:
-            pdf_URL = 'https:'+pdf_URL
-        print('PDF URL:', pdf_URL)
-        name = re.search(re.compile('fdp.*?/'), pdf_URL[::-1]).group()[::-1][1:]
-        print('PDF file name:', name)
-        print('Saved in:', os.getcwd())
-        print('\nDownloading reference', i0)
-        r = requests.get(pdf_URL, stream=True)
-        with open(name, 'wb') as f:
-            for chunk in r.iter_content(chunk_size=32):
-                f.write(chunk)
-        print('Reference', i0, 'downloaded!')
-    print('\nAll downloads finished!')
-
-
-if __name__ == '__main__':
+"""
+This code is supported by the website: https://www.guanjihuan.com
+The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869
+"""
+
+import PyPDF2
+import os
+import re
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+import requests
+
+
+
+def main():
+    os.chdir('D:/')  # directory containing the PDF file
+    filename = input('Enter the PDF file name: ')
+    pdfFile = open(filename+'.pdf', 'rb')  # open the PDF file
+    links = all_links_in_pdf(pdfFile)  # collect the links in the PDF
+    pdfFile.close()  # close the PDF file
+    os.chdir('D:/Reference')  # directory where the references are saved
+    download(links)  # download the references
+
+
+
+def all_links_in_pdf(pdfFile):
+    pdfReader = PyPDF2.PdfFileReader(pdfFile)
+    pages = pdfReader.getNumPages()
+    i0 = 0
+    links = []
+    print()
+    for page in range(pages):
+        pageSliced = pdfReader.getPage(page)
+        pageObject = pageSliced.getObject()
+        if '/Annots' in pageObject.keys():
+            ann = pageObject['/Annots']
+            old = ''
+            for a in ann:
+                u = a.getObject()
+                if '/A' in u.keys():
+                    if re.search(re.compile('^https://doi.org'), u['/A']['/URI']):  # keep only DOI links
+                        if u['/A']['/URI'] != old:  # skip repeated links
+                            print(i0, u['/A']['/URI'])
+                            links.append(u['/A']['/URI'])  # store the link in the links array
+                            i0 += 1
+                            old = u['/A']['/URI']
+    return links
+
+
+
+def download(links):
+    for i0 in [0, 1, 3]:  # download selected references; to download all, use: for i0 in range(len(links)):
+        address = links[i0]
+        r = requests.post('https://sci-hub.st/', data={'request': address})
+        print('\nResponse:', r)
+        print('URL visited:', r.url)
+        soup = BeautifulSoup(r.text, features='lxml')
+        pdf_URL = soup.iframe['src']
+        if re.search(re.compile('^https:'), pdf_URL):
+            pass
+        else:
+            pdf_URL = 'https:'+pdf_URL
+        print('PDF URL:', pdf_URL)
+        name = re.search(re.compile('fdp.*?/'), pdf_URL[::-1]).group()[::-1][1:]
+        print('PDF file name:', name)
+        print('Saved in:', os.getcwd())
+        print('\nDownloading reference', i0)
+        r = requests.get(pdf_URL, stream=True)
+        with open(name, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=32):
+                f.write(chunk)
+        print('Reference', i0, 'downloaded!')
+    print('\nAll downloads finished!')
+
+
+if __name__ == '__main__':
     main()
\ No newline at end of file
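
Both download loops assume Sci-Hub always returns a page with an embedded iframe; when it does not, soup.iframe is None and soup.iframe['src'] raises a TypeError. A defensive sketch, reusing the variable names from the script above:

    # Sketch: guard against a missing iframe before reading its 'src'.
    iframe = soup.find('iframe')
    if iframe is None or not iframe.get('src'):
        print('No PDF found for:', address)  # skip this reference
    else:
        pdf_URL = iframe['src']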
diff --git a/language_learning/2020.10.31_1_download_references_in_a_pdf_file_with_python/get_links_from_a_pdf_file.py b/language_learning/2020.10.31_download_references_in_a_pdf_file_with_python/get_links_from_a_pdf_file.py
old mode 100755
new mode 100644
similarity index 97%
rename from language_learning/2020.10.31_1_download_references_in_a_pdf_file_with_python/get_links_from_a_pdf_file.py
rename to language_learning/2020.10.31_download_references_in_a_pdf_file_with_python/get_links_from_a_pdf_file.py
index 5e6a155..94ca406
--- a/language_learning/2020.10.31_1_download_references_in_a_pdf_file_with_python/get_links_from_a_pdf_file.py
+++ b/language_learning/2020.10.31_download_references_in_a_pdf_file_with_python/get_links_from_a_pdf_file.py
@@ -1,30 +1,30 @@
-"""
-This code is supported by the website: https://www.guanjihuan.com
-The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869
-"""
-
-import PyPDF2
-import os
-import re
-
-os.chdir('D:/')  # directory containing the PDF file
-filename = input('Enter the PDF file name: ')
-pdfFile = open(filename+'.pdf', 'rb')
-pdfReader = PyPDF2.PdfFileReader(pdfFile)
-pages = pdfReader.getNumPages()
-i0 = 0
-for page in range(pages):
-    pageSliced = pdfReader.getPage(page)
-    pageObject = pageSliced.getObject()
-    if '/Annots' in pageObject.keys():
-        ann = pageObject['/Annots']
-        old = ''
-        for a in ann:
-            u = a.getObject()
-            if '/A' in u.keys():
-                if re.search(re.compile('^https://doi.org'), u['/A']['/URI']):  # keep only DOI links
-                    if u['/A']['/URI'] != old:  # skip repeated links
-                        print(i0, u['/A']['/URI'])
-                        i0 += 1
-                        old = u['/A']['/URI']
+"""
+This code is supported by the website: https://www.guanjihuan.com
+The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869
+"""
+
+import PyPDF2
+import os
+import re
+
+os.chdir('D:/')  # directory containing the PDF file
+filename = input('Enter the PDF file name: ')
+pdfFile = open(filename+'.pdf', 'rb')
+pdfReader = PyPDF2.PdfFileReader(pdfFile)
+pages = pdfReader.getNumPages()
+i0 = 0
+for page in range(pages):
+    pageSliced = pdfReader.getPage(page)
+    pageObject = pageSliced.getObject()
+    if '/Annots' in pageObject.keys():
+        ann = pageObject['/Annots']
+        old = ''
+        for a in ann:
+            u = a.getObject()
+            if '/A' in u.keys():
+                if re.search(re.compile('^https://doi.org'), u['/A']['/URI']):  # keep only DOI links
+                    if u['/A']['/URI'] != old:  # skip repeated links
+                        print(i0, u['/A']['/URI'])
+                        i0 += 1
+                        old = u['/A']['/URI']
 pdfFile.close()
\ No newline at end of file
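
The PDF scripts in this patch use PyPDF2's legacy 1.x names (PdfFileReader, getNumPages, getPage). As a hedged sketch only: the project is now maintained as pypdf, and the same DOI-link walk there would look roughly like this (assuming pypdf 3.x; 'paper.pdf' is a placeholder path):

    # Sketch: pypdf (3.x) version of the annotation walk; not part of the patch.
    from pypdf import PdfReader

    reader = PdfReader('paper.pdf')  # placeholder file name
    links = []
    for page in reader.pages:
        for a in page.get('/Annots', []):
            u = a.get_object()
            action = u.get('/A')
            if action and '/URI' in action and action['/URI'].startswith('https://doi.org'):
                if action['/URI'] not in links:  # drop all repeats, not just adjacent ones
                    links.append(action['/URI'])
    print(links)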