update
@@ -1,47 +1,47 @@
from bs4 import BeautifulSoup
from urllib.request import urlopen


# The simplest case
html = urlopen("https://mofanpy.com/static/scraping/basic-structure.html").read().decode('utf-8')
print('\nPage source 1:\n\n ----------------Start----------------\n', html, '\n\n----------------End----------------')  # show the page source

soup = BeautifulSoup(html, features='lxml')  # load the page into BeautifulSoup
print('\nContent of the h1 heading tag, soup.h1:\n', soup.h1)
print('\nContent of the p paragraph tag, soup.p:\n', soup.p)
print('\nContent of the a link tag, soup.a:\n', soup.a)

all_href = soup.find_all('a')
print("\nAll 'a' tags, soup.find_all('a'):\n", all_href)

print('\nReading an attribute value, method 1:')
for a in all_href:
    print(a)
    print(a['href'])

all_href = [a['href'] for a in all_href]
print('\nReading an attribute value, method 2:\n', all_href, '\n')


# A page that uses CSS classes
html = urlopen("https://mofanpy.com/static/scraping/list.html").read().decode('utf-8')
print('\nPage source 2:\n\n ----------------Start----------------\n', html, '\n\n----------------End----------------')  # show the page source

soup = BeautifulSoup(html, features='lxml')  # load the page into BeautifulSoup

print('\nFilter the required information by class:')
month = soup.find_all('li', {"class": "month"})
print(month, '\n')

print('Text only:')
for m in month:
    print(m.get_text())

print('\nFiltering in several steps:')
january = soup.find('ul', {"class": 'jan'})
print(january, '\n')
d_january = january.find_all('li')  # use january as a parent
print(d_january, '\n')
for d in d_january:
    print(d.get_text())
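
The chained find / find_all calls above can also be written as a single CSS selector. A minimal alternative sketch, assuming the same list.html structure (a <ul class="jan"> that contains <li> items):

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://mofanpy.com/static/scraping/list.html").read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')
for li in soup.select('ul.jan li'):  # CSS selector: <li> items inside <ul class="jan">
    print(li.get_text())
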
@@ -1,45 +0,0 @@
"""
This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6846
"""

from bs4 import BeautifulSoup
from urllib.request import urlopen
import re  # regular expression module
import requests
import os
# os.chdir('D:')  # set the directory where the files are saved


# Input
address_array = []
for i in range(10):  # download at most 10 papers per run
    address = input('\nEnter a DOI/link/title: ')
    address_array.append(address)
    continue_or_not = input('\nAdd another (1) / stop adding (0): ')
    if int(continue_or_not) == 0:
        break

# Download
for address in address_array:
    r = requests.post('https://sci-hub.st/', data={'request': address})
    print('\nResponse:', r)
    print('Visited URL:', r.url)
    soup = BeautifulSoup(r.text, features='lxml')
    pdf_URL = soup.iframe['src']
    if re.search(re.compile('^https:'), pdf_URL):
        pass
    else:
        pdf_URL = 'https:' + pdf_URL
    print('PDF URL:', pdf_URL)
    # reverse the URL, match everything up to the first '/', reverse back,
    # and drop the leading '/' to obtain the file name ending in .pdf
    name = re.search(re.compile('fdp.*?/'), pdf_URL[::-1]).group()[::-1][1::]
    print('PDF file name:', name)
    print('Saved in:', os.getcwd())
    print('\nDownloading')
    r = requests.get(pdf_URL, stream=True)
    with open(name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=32):
            f.write(chunk)
    print('Download finished!')

print('\nAll downloads finished!')
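
The script above indexes soup.iframe['src'] directly, which raises a TypeError when the result page contains no <iframe> (for example, when the paper is not found). A minimal defensive sketch; extract_pdf_url is a hypothetical helper name, not part of the original code:

from bs4 import BeautifulSoup

def extract_pdf_url(page_html):
    # Return the PDF URL from a result page, or None if no <iframe> with a src is present.
    soup = BeautifulSoup(page_html, features='lxml')
    iframe = soup.find('iframe')
    if iframe is None or not iframe.get('src'):
        return None
    pdf_url = iframe['src']
    if not pdf_url.startswith('https:'):
        pdf_url = 'https:' + pdf_url  # same scheme fix-up as the script above
    return pdf_url
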
@@ -1,77 +1,77 @@
"""
This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869
"""

import PyPDF2
import os
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests


def main():
    os.chdir('D:/')  # directory containing the PDF file
    filename = input('Enter the PDF file name: ')
    pdfFile = open(filename + '.pdf', 'rb')  # open the PDF file
    links = all_links_in_pdf(pdfFile)  # collect the links in the PDF file
    pdfFile.close()  # close the PDF file
    os.chdir('D:/Reference')  # directory where the references are saved
    download(links)  # download the references


def all_links_in_pdf(pdfFile):
    pdfReader = PyPDF2.PdfFileReader(pdfFile)
    pages = pdfReader.getNumPages()
    i0 = 0
    links = []
    print()
    for page in range(pages):
        pageSliced = pdfReader.getPage(page)
        pageObject = pageSliced.getObject()
        if '/Annots' in pageObject.keys():
            ann = pageObject['/Annots']
            old = ''
            for a in ann:
                u = a.getObject()
                if '/A' in u.keys():
                    if re.search(re.compile('^https://doi.org'), u['/A']['/URI']):  # skip links of other kinds
                        if u['/A']['/URI'] != old:  # skip consecutive duplicate links
                            print(i0, u['/A']['/URI'])
                            links.append(u['/A']['/URI'])  # store the link in the links list
                            i0 += 1
                            old = u['/A']['/URI']
    return links


def download(links):
    for i0 in [0, 1, 3]:  # download selected references; to download all of them, use: for i0 in range(len(links)):
        address = links[i0]
        r = requests.post('https://sci-hub.st/', data={'request': address})
        print('\nResponse:', r)
        print('Visited URL:', r.url)
        soup = BeautifulSoup(r.text, features='lxml')
        pdf_URL = soup.iframe['src']
        if re.search(re.compile('^https:'), pdf_URL):
            pass
        else:
            pdf_URL = 'https:' + pdf_URL
        print('PDF URL:', pdf_URL)
        # reverse the URL, match everything up to the first '/', reverse back,
        # and drop the leading '/' to obtain the file name ending in .pdf
        name = re.search(re.compile('fdp.*?/'), pdf_URL[::-1]).group()[::-1][1::]
        print('PDF file name:', name)
        print('Saved in:', os.getcwd())
        print('\nDownloading reference', i0)
        r = requests.get(pdf_URL, stream=True)
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=32):
                f.write(chunk)
        print('Reference', i0, 'downloaded!')
    print('\nAll downloads finished!')


if __name__ == '__main__':
    main()
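
Both download loops above stream the response with chunk_size=32, i.e. in 32-byte pieces, which is unnecessarily slow for multi-megabyte PDFs. A hedged sketch of a more conventional streaming download; download_pdf is a hypothetical helper name, and the 8192-byte chunk size is an assumption, not something from the original code:

import requests

def download_pdf(pdf_url, file_name):
    # Stream the PDF to disk in larger chunks and fail early on HTTP errors.
    r = requests.get(pdf_url, stream=True)
    r.raise_for_status()
    with open(file_name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
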
@@ -1,30 +1,30 @@
"""
This code is supported by the website: https://www.guanjihuan.com
The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869
"""

import PyPDF2
import os
import re

os.chdir('D:/')  # directory containing the PDF file
filename = input('Enter the PDF file name: ')
pdfFile = open(filename + '.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFile)
pages = pdfReader.getNumPages()
i0 = 0
for page in range(pages):
    pageSliced = pdfReader.getPage(page)
    pageObject = pageSliced.getObject()
    if '/Annots' in pageObject.keys():
        ann = pageObject['/Annots']
        old = ''
        for a in ann:
            u = a.getObject()
            if '/A' in u.keys():
                if re.search(re.compile('^https://doi.org'), u['/A']['/URI']):  # skip links of other kinds
                    if u['/A']['/URI'] != old:  # skip consecutive duplicate links
                        print(i0, u['/A']['/URI'])
                        i0 += 1
                        old = u['/A']['/URI']
pdfFile.close()
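
The two PyPDF2 scripts above use the legacy interface (PdfFileReader, getNumPages, getPage). A minimal sketch of the same DOI-link extraction with the maintained pypdf package; it assumes pypdf is installed, doi_links_in_pdf is a hypothetical helper name, and it drops every repeated link rather than only consecutive repeats:

from pypdf import PdfReader

def doi_links_in_pdf(path):
    # Collect unique https://doi.org/... link annotations from a PDF, in page order.
    reader = PdfReader(path)
    links = []
    for page in reader.pages:
        if '/Annots' not in page:
            continue
        for annot in page['/Annots']:
            obj = annot.get_object()
            if '/A' in obj and '/URI' in obj['/A']:
                uri = str(obj['/A']['/URI'])
                if uri.startswith('https://doi.org') and uri not in links:
                    links.append(uri)
    return links

# Usage sketch: print(doi_links_in_pdf('paper.pdf'))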