guanjihuan
2021-08-09 12:45:46 +08:00
parent b101731c06
commit b115f2fb1d
4 changed files with 151 additions and 196 deletions


@@ -1,47 +1,47 @@
from bs4 import BeautifulSoup
from urllib.request import urlopen

# The simplest case
html = urlopen("https://mofanpy.com/static/scraping/basic-structure.html").read().decode('utf-8')
print('\nPage source 1\n\n ----------------start----------------\n', html, '\n\n----------------end----------------')  # show the page source
soup = BeautifulSoup(html, features='lxml')  # load the page into BeautifulSoup
print('\nContent of the first h1 tag, soup.h1:\n', soup.h1)
print('\nContent of the first p tag, soup.p:\n', soup.p)
print('\nContent of the first a tag, soup.a:\n', soup.a)
all_href = soup.find_all('a')
print('\nAll a tags, soup.find_all("a"):\n', all_href)
print('\nRead a tag attribute like a dictionary, method 1:')
for a in all_href:
    print(a)
    print(a['href'])
all_href = [a['href'] for a in all_href]
print('\nRead a tag attribute like a dictionary, method 2:\n', all_href, '\n')

# A page with CSS classes
html = urlopen("https://mofanpy.com/static/scraping/list.html").read().decode('utf-8')
print('\nPage source 2\n\n ----------------start----------------\n', html, '\n\n----------------end----------------')  # show the page source
soup = BeautifulSoup(html, features='lxml')  # load the page into BeautifulSoup
print('\nFilter elements by class:')
month = soup.find_all('li', {"class": "month"})
print(month, '\n')
print('Text only:')
for m in month:
    print(m.get_text())
print('\nFilter in two steps:')
january = soup.find('ul', {"class": 'jan'})
print(january, '\n')
d_january = january.find_all('li')  # use january as the parent
print(d_january, '\n')
for d in d_january:
    print(d.get_text())
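The same find_all/get_text pattern can be tried without any network access; below is a minimal sketch on a made-up inline HTML string (the snippet and variable names are illustrative, not taken from the pages above).

from bs4 import BeautifulSoup

demo_html = '<ul><li class="month">January</li><li class="month">February</li></ul>'  # made-up snippet
demo_soup = BeautifulSoup(demo_html, features='lxml')
for li in demo_soup.find_all('li', {"class": "month"}):
    print(li.get_text())  # prints January, then February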


@@ -1,45 +0,0 @@
"""
This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6846
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re # 正则模块
import requests
import os
# os.chdir('D:') # 设置文件保存的位置
# 输入
address_array = []
for i in range(10): # 最多一次性下载10篇
address = input('\n输入DOI/链接/标题:')
address_array.append(address)
continue_or_not = input('\n继续添加1/不继续添加0')
if int(continue_or_not) == 0:
break
# 下载
for address in address_array:
r = requests.post('https://sci-hub.st/', data={'request': address})
print('\n响应结果是:', r)
print('访问的地址是:', r.url)
soup = BeautifulSoup(r.text, features='lxml')
pdf_URL = soup.iframe['src']
if re.search(re.compile('^https:'), pdf_URL):
pass
else:
pdf_URL = 'https:'+pdf_URL
print('PDF的地址是', pdf_URL)
name = re.search(re.compile('fdp.*?/'),pdf_URL[::-1]).group()[::-1][1::]
print('PDF文件名是', name)
print('保存的位置在:', os.getcwd())
print('\n正在下载')
r = requests.get(pdf_URL, stream=True)
with open(name, 'wb') as f:
for chunk in r.iter_content(chunk_size=32):
f.write(chunk)
print('下载完成!')
print('\n全部下载完成!')
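The reversed-string regex above recovers the file name by searching for 'fdp' in the reversed URL. A sketch of an equivalent, arguably clearer extraction with the standard library, assuming the PDF URL ends with the file name and carries no query string; the URL below is a placeholder, not from the script above.

from urllib.parse import urlsplit
import os

example_pdf_URL = 'https://example.org/papers/some-article.pdf'  # placeholder URL
name = os.path.basename(urlsplit(example_pdf_URL).path)  # -> 'some-article.pdf'
print(name)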


@@ -1,77 +1,77 @@
""" """
This code is supported by the website: https://www.guanjihuan.com This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869 The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869
""" """
import PyPDF2 import PyPDF2
import os import os
import re import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.request import urlopen from urllib.request import urlopen
import requests import requests
def main(): def main():
os.chdir('D:/') # PDF文件存放的位置 os.chdir('D:/') # PDF文件存放的位置
filename = input('输入PDF文件名') filename = input('输入PDF文件名')
pdfFile = open(filename+'.pdf','rb') # 打开PDF文件 pdfFile = open(filename+'.pdf','rb') # 打开PDF文件
links = all_links_in_pdf(pdfFile) # 获取PDF文件中的链接 links = all_links_in_pdf(pdfFile) # 获取PDF文件中的链接
pdfFile.close() # 关闭PDF文件 pdfFile.close() # 关闭PDF文件
os.chdir('D:/Reference') # 设置参考文献保存的位置 os.chdir('D:/Reference') # 设置参考文献保存的位置
download(links) # 下载文献 download(links) # 下载文献
def all_links_in_pdf(pdfFile): def all_links_in_pdf(pdfFile):
pdfReader = PyPDF2.PdfFileReader(pdfFile) pdfReader = PyPDF2.PdfFileReader(pdfFile)
pages = pdfReader.getNumPages() pages = pdfReader.getNumPages()
i0 = 0 i0 = 0
links = [] links = []
print() print()
for page in range(pages): for page in range(pages):
pageSliced = pdfReader.getPage(page) pageSliced = pdfReader.getPage(page)
pageObject = pageSliced.getObject() pageObject = pageSliced.getObject()
if '/Annots' in pageObject.keys(): if '/Annots' in pageObject.keys():
ann = pageObject['/Annots'] ann = pageObject['/Annots']
old = '' old = ''
for a in ann: for a in ann:
u = a.getObject() u = a.getObject()
if '/A' in u.keys(): if '/A' in u.keys():
if re.search(re.compile('^https://doi.org'), u['/A']['/URI']): # 排除其他形式的链接 if re.search(re.compile('^https://doi.org'), u['/A']['/URI']): # 排除其他形式的链接
if u['/A']['/URI'] != old: # 排除重复链接 if u['/A']['/URI'] != old: # 排除重复链接
print(i0 , u['/A']['/URI']) print(i0 , u['/A']['/URI'])
links.append(u['/A']['/URI']) # 链接存在link数组中 links.append(u['/A']['/URI']) # 链接存在link数组中
i0 += 1 i0 += 1
old = u['/A']['/URI'] old = u['/A']['/URI']
return links return links
def download(links): def download(links):
for i0 in [0, 1, 3]: # 指定参考文献下载如需全部下载用for i0 in range(links.shape[0]): for i0 in [0, 1, 3]: # 指定参考文献下载如需全部下载用for i0 in range(links.shape[0]):
address = links[i0] address = links[i0]
r = requests.post('https://sci-hub.st/', data={'request': address}) r = requests.post('https://sci-hub.st/', data={'request': address})
print('\n响应结果是:', r) print('\n响应结果是:', r)
print('访问的地址是:', r.url) print('访问的地址是:', r.url)
soup = BeautifulSoup(r.text, features='lxml') soup = BeautifulSoup(r.text, features='lxml')
pdf_URL = soup.iframe['src'] pdf_URL = soup.iframe['src']
if re.search(re.compile('^https:'), pdf_URL): if re.search(re.compile('^https:'), pdf_URL):
pass pass
else: else:
pdf_URL = 'https:'+pdf_URL pdf_URL = 'https:'+pdf_URL
print('PDF的地址是', pdf_URL) print('PDF的地址是', pdf_URL)
name = re.search(re.compile('fdp.*?/'),pdf_URL[::-1]).group()[::-1][1::] name = re.search(re.compile('fdp.*?/'),pdf_URL[::-1]).group()[::-1][1::]
print('PDF文件名是', name) print('PDF文件名是', name)
print('保存的位置在:', os.getcwd()) print('保存的位置在:', os.getcwd())
print('\n正在下载第',i0,'') print('\n正在下载第',i0,'')
r = requests.get(pdf_URL, stream=True) r = requests.get(pdf_URL, stream=True)
with open(name, 'wb') as f: with open(name, 'wb') as f:
for chunk in r.iter_content(chunk_size=32): for chunk in r.iter_content(chunk_size=32):
f.write(chunk) f.write(chunk)
print('',i0,'篇下载完成!') print('',i0,'篇下载完成!')
print('\n全部下载完成!') print('\n全部下载完成!')
if __name__ == '__main__': if __name__ == '__main__':
main() main()
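Streaming with chunk_size=32 writes the file 32 bytes at a time, which is noticeably slow for large PDFs. Below is a hedged sketch of the same download step with a larger chunk size, a timeout, and a basic HTTP status check; the helper name save_pdf and its arguments are illustrative, not part of the script above.

import requests

def save_pdf(pdf_URL, name):
    r = requests.get(pdf_URL, stream=True, timeout=60)
    r.raise_for_status()  # stop on 4xx/5xx responses instead of saving an error page
    with open(name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):  # larger chunks than 32 bytes
            if chunk:
                f.write(chunk)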


@@ -1,30 +1,30 @@
""" """
This code is supported by the website: https://www.guanjihuan.com This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869 The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869
""" """
import PyPDF2 import PyPDF2
import os import os
import re import re
os.chdir('D:/') # PDF文件存放的位置 os.chdir('D:/') # PDF文件存放的位置
filename = input('输入PDF文件名') filename = input('输入PDF文件名')
pdfFile = open(filename+'.pdf','rb') pdfFile = open(filename+'.pdf','rb')
pdfReader = PyPDF2.PdfFileReader(pdfFile) pdfReader = PyPDF2.PdfFileReader(pdfFile)
pages = pdfReader.getNumPages() pages = pdfReader.getNumPages()
i0 = 0 i0 = 0
for page in range(pages): for page in range(pages):
pageSliced = pdfReader.getPage(page) pageSliced = pdfReader.getPage(page)
pageObject = pageSliced.getObject() pageObject = pageSliced.getObject()
if '/Annots' in pageObject.keys(): if '/Annots' in pageObject.keys():
ann = pageObject['/Annots'] ann = pageObject['/Annots']
old = '' old = ''
for a in ann: for a in ann:
u = a.getObject() u = a.getObject()
if '/A' in u.keys(): if '/A' in u.keys():
if re.search(re.compile('^https://doi.org'), u['/A']['/URI']): # 排除其他形式的链接 if re.search(re.compile('^https://doi.org'), u['/A']['/URI']): # 排除其他形式的链接
if u['/A']['/URI'] != old: # 排除重复链接 if u['/A']['/URI'] != old: # 排除重复链接
print(i0 , u['/A']['/URI']) print(i0 , u['/A']['/URI'])
i0 += 1 i0 += 1
old = u['/A']['/URI'] old = u['/A']['/URI']
pdfFile.close() pdfFile.close()
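PdfFileReader, getNumPages, and getPage are the legacy PyPDF2 names. If the newer pypdf package happens to be installed instead (an assumption, not a requirement of the script above), the same DOI-link scan could be sketched as follows; 'paper.pdf' is a placeholder file name.

from pypdf import PdfReader  # assumes pypdf is installed

reader = PdfReader('paper.pdf')  # placeholder file name
doi_links = []
for page in reader.pages:
    for annotation in page.get('/Annots') or []:
        obj = annotation.get_object()
        if '/A' in obj and '/URI' in obj['/A']:
            uri = obj['/A']['/URI']
            if uri.startswith('https://doi.org') and uri not in doi_links:  # DOI links only, no duplicates
                doi_links.append(uri)
print(doi_links)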