guanjihuan
2021-08-09 12:45:46 +08:00
parent b101731c06
commit b115f2fb1d
4 changed files with 151 additions and 196 deletions


@@ -1,47 +1,47 @@
from bs4 import BeautifulSoup
from urllib.request import urlopen

# The simplest case
html = urlopen("https://mofanpy.com/static/scraping/basic-structure.html").read().decode('utf-8')
print('\nPage source 1\n\n ----------------start----------------\n', html, '\n\n----------------end----------------')  # show the page source
soup = BeautifulSoup(html, features='lxml')  # load the page into BeautifulSoup
print('\nContent of the first h1 tag, soup.h1:\n', soup.h1)
print('\nContent of the first p tag, soup.p:\n', soup.p)
print('\nContent of the first a tag, soup.a:\n', soup.a)
all_href = soup.find_all('a')
print('\nAll a tags, soup.find_all("a"):\n', all_href)
print('\nRead a tag attribute like a dictionary, method 1:')
for a in all_href:
    print(a)
    print(a['href'])
all_href = [a['href'] for a in all_href]
print('\nRead a tag attribute like a dictionary, method 2:\n', all_href, '\n')

# A page with CSS classes
html = urlopen("https://mofanpy.com/static/scraping/list.html").read().decode('utf-8')
print('\nPage source 2\n\n ----------------start----------------\n', html, '\n\n----------------end----------------')  # show the page source
soup = BeautifulSoup(html, features='lxml')  # load the page into BeautifulSoup
print('\nFilter elements by class:')
month = soup.find_all('li', {"class": "month"})
print(month, '\n')
print('Text only:')
for m in month:
    print(m.get_text())
print('\nFilter in two steps:')
january = soup.find('ul', {"class": 'jan'})
print(january, '\n')
d_january = january.find_all('li')  # use january as the parent
print(d_january, '\n')
for d in d_january:
    print(d.get_text())
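The same find_all/get_text pattern can be tried without any network access; below is a minimal sketch on a made-up inline HTML string (the snippet and variable names are illustrative, not taken from the pages above).

from bs4 import BeautifulSoup

demo_html = '<ul><li class="month">January</li><li class="month">February</li></ul>'  # made-up snippet
demo_soup = BeautifulSoup(demo_html, features='lxml')
for li in demo_soup.find_all('li', {"class": "month"}):
    print(li.get_text())  # prints January, then February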


@@ -1,45 +0,0 @@
"""
This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6846
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re # 正则模块
import requests
import os
# os.chdir('D:') # 设置文件保存的位置
# 输入
address_array = []
for i in range(10): # 最多一次性下载10篇
address = input('\n输入DOI/链接/标题:')
address_array.append(address)
continue_or_not = input('\n继续添加1/不继续添加0')
if int(continue_or_not) == 0:
break
# 下载
for address in address_array:
r = requests.post('https://sci-hub.st/', data={'request': address})
print('\n响应结果是:', r)
print('访问的地址是:', r.url)
soup = BeautifulSoup(r.text, features='lxml')
pdf_URL = soup.iframe['src']
if re.search(re.compile('^https:'), pdf_URL):
pass
else:
pdf_URL = 'https:'+pdf_URL
print('PDF的地址是', pdf_URL)
name = re.search(re.compile('fdp.*?/'),pdf_URL[::-1]).group()[::-1][1::]
print('PDF文件名是', name)
print('保存的位置在:', os.getcwd())
print('\n正在下载')
r = requests.get(pdf_URL, stream=True)
with open(name, 'wb') as f:
for chunk in r.iter_content(chunk_size=32):
f.write(chunk)
print('下载完成!')
print('\n全部下载完成!')
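The reversed-string regex above recovers the file name by searching for 'fdp' in the reversed URL. A sketch of an equivalent, arguably clearer extraction with the standard library, assuming the PDF URL ends with the file name and carries no query string; the URL below is a placeholder, not from the script above.

from urllib.parse import urlsplit
import os

example_pdf_URL = 'https://example.org/papers/some-article.pdf'  # placeholder URL
name = os.path.basename(urlsplit(example_pdf_URL).path)  # -> 'some-article.pdf'
print(name)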


@@ -1,77 +1,77 @@
""" """
This code is supported by the website: https://www.guanjihuan.com This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869 The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869
""" """
import PyPDF2 import PyPDF2
import os import os
import re import re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.request import urlopen from urllib.request import urlopen
import requests import requests
def main(): def main():
os.chdir('D:/') # PDF文件存放的位置 os.chdir('D:/') # PDF文件存放的位置
filename = input('输入PDF文件名') filename = input('输入PDF文件名')
pdfFile = open(filename+'.pdf','rb') # 打开PDF文件 pdfFile = open(filename+'.pdf','rb') # 打开PDF文件
links = all_links_in_pdf(pdfFile) # 获取PDF文件中的链接 links = all_links_in_pdf(pdfFile) # 获取PDF文件中的链接
pdfFile.close() # 关闭PDF文件 pdfFile.close() # 关闭PDF文件
os.chdir('D:/Reference') # 设置参考文献保存的位置 os.chdir('D:/Reference') # 设置参考文献保存的位置
download(links) # 下载文献 download(links) # 下载文献
def all_links_in_pdf(pdfFile): def all_links_in_pdf(pdfFile):
pdfReader = PyPDF2.PdfFileReader(pdfFile) pdfReader = PyPDF2.PdfFileReader(pdfFile)
pages = pdfReader.getNumPages() pages = pdfReader.getNumPages()
i0 = 0 i0 = 0
links = [] links = []
print() print()
for page in range(pages): for page in range(pages):
pageSliced = pdfReader.getPage(page) pageSliced = pdfReader.getPage(page)
pageObject = pageSliced.getObject() pageObject = pageSliced.getObject()
if '/Annots' in pageObject.keys(): if '/Annots' in pageObject.keys():
ann = pageObject['/Annots'] ann = pageObject['/Annots']
old = '' old = ''
for a in ann: for a in ann:
u = a.getObject() u = a.getObject()
if '/A' in u.keys(): if '/A' in u.keys():
if re.search(re.compile('^https://doi.org'), u['/A']['/URI']): # 排除其他形式的链接 if re.search(re.compile('^https://doi.org'), u['/A']['/URI']): # 排除其他形式的链接
if u['/A']['/URI'] != old: # 排除重复链接 if u['/A']['/URI'] != old: # 排除重复链接
print(i0 , u['/A']['/URI']) print(i0 , u['/A']['/URI'])
links.append(u['/A']['/URI']) # 链接存在link数组中 links.append(u['/A']['/URI']) # 链接存在link数组中
i0 += 1 i0 += 1
old = u['/A']['/URI'] old = u['/A']['/URI']
return links return links
def download(links): def download(links):
for i0 in [0, 1, 3]: # 指定参考文献下载如需全部下载用for i0 in range(links.shape[0]): for i0 in [0, 1, 3]: # 指定参考文献下载如需全部下载用for i0 in range(links.shape[0]):
address = links[i0] address = links[i0]
r = requests.post('https://sci-hub.st/', data={'request': address}) r = requests.post('https://sci-hub.st/', data={'request': address})
print('\n响应结果是:', r) print('\n响应结果是:', r)
print('访问的地址是:', r.url) print('访问的地址是:', r.url)
soup = BeautifulSoup(r.text, features='lxml') soup = BeautifulSoup(r.text, features='lxml')
pdf_URL = soup.iframe['src'] pdf_URL = soup.iframe['src']
if re.search(re.compile('^https:'), pdf_URL): if re.search(re.compile('^https:'), pdf_URL):
pass pass
else: else:
pdf_URL = 'https:'+pdf_URL pdf_URL = 'https:'+pdf_URL
print('PDF的地址是', pdf_URL) print('PDF的地址是', pdf_URL)
name = re.search(re.compile('fdp.*?/'),pdf_URL[::-1]).group()[::-1][1::] name = re.search(re.compile('fdp.*?/'),pdf_URL[::-1]).group()[::-1][1::]
print('PDF文件名是', name) print('PDF文件名是', name)
print('保存的位置在:', os.getcwd()) print('保存的位置在:', os.getcwd())
print('\n正在下载第',i0,'') print('\n正在下载第',i0,'')
r = requests.get(pdf_URL, stream=True) r = requests.get(pdf_URL, stream=True)
with open(name, 'wb') as f: with open(name, 'wb') as f:
for chunk in r.iter_content(chunk_size=32): for chunk in r.iter_content(chunk_size=32):
f.write(chunk) f.write(chunk)
print('',i0,'篇下载完成!') print('',i0,'篇下载完成!')
print('\n全部下载完成!') print('\n全部下载完成!')
if __name__ == '__main__': if __name__ == '__main__':
main() main()
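Streaming with chunk_size=32 writes the file 32 bytes at a time, which is noticeably slow for large PDFs. Below is a hedged sketch of the same download step with a larger chunk size, a timeout, and a basic HTTP status check; the helper name save_pdf and its arguments are illustrative, not part of the script above.

import requests

def save_pdf(pdf_URL, name):
    r = requests.get(pdf_URL, stream=True, timeout=60)
    r.raise_for_status()  # stop on 4xx/5xx responses instead of saving an error page
    with open(name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):  # larger chunks than 32 bytes
            if chunk:
                f.write(chunk)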


@@ -1,30 +1,30 @@
""" """
This code is supported by the website: https://www.guanjihuan.com This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869 The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869
""" """
import PyPDF2 import PyPDF2
import os import os
import re import re
os.chdir('D:/') # PDF文件存放的位置 os.chdir('D:/') # PDF文件存放的位置
filename = input('输入PDF文件名') filename = input('输入PDF文件名')
pdfFile = open(filename+'.pdf','rb') pdfFile = open(filename+'.pdf','rb')
pdfReader = PyPDF2.PdfFileReader(pdfFile) pdfReader = PyPDF2.PdfFileReader(pdfFile)
pages = pdfReader.getNumPages() pages = pdfReader.getNumPages()
i0 = 0 i0 = 0
for page in range(pages): for page in range(pages):
pageSliced = pdfReader.getPage(page) pageSliced = pdfReader.getPage(page)
pageObject = pageSliced.getObject() pageObject = pageSliced.getObject()
if '/Annots' in pageObject.keys(): if '/Annots' in pageObject.keys():
ann = pageObject['/Annots'] ann = pageObject['/Annots']
old = '' old = ''
for a in ann: for a in ann:
u = a.getObject() u = a.getObject()
if '/A' in u.keys(): if '/A' in u.keys():
if re.search(re.compile('^https://doi.org'), u['/A']['/URI']): # 排除其他形式的链接 if re.search(re.compile('^https://doi.org'), u['/A']['/URI']): # 排除其他形式的链接
if u['/A']['/URI'] != old: # 排除重复链接 if u['/A']['/URI'] != old: # 排除重复链接
print(i0 , u['/A']['/URI']) print(i0 , u['/A']['/URI'])
i0 += 1 i0 += 1
old = u['/A']['/URI'] old = u['/A']['/URI']
pdfFile.close() pdfFile.close()
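PdfFileReader, getNumPages, and getPage are the legacy PyPDF2 names. If the newer pypdf package happens to be installed instead (an assumption, not a requirement of the script above), the same DOI-link scan could be sketched as follows; 'paper.pdf' is a placeholder file name.

from pypdf import PdfReader  # assumes pypdf is installed

reader = PdfReader('paper.pdf')  # placeholder file name
doi_links = []
for page in reader.pages:
    for annotation in page.get('/Annots') or []:
        obj = annotation.get_object()
        if '/A' in obj and '/URI' in obj['/A']:
            uri = obj['/A']['/URI']
            if uri.startswith('https://doi.org') and uri not in doi_links:  # DOI links only, no duplicates
                doi_links.append(uri)
print(doi_links)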