0.1.91
This commit is contained in:
parent
d17d4433f7
commit
505abf4da0
@ -1,7 +1,7 @@
|
|||||||
[metadata]
|
[metadata]
|
||||||
# replace with your username:
|
# replace with your username:
|
||||||
name = guan
|
name = guan
|
||||||
version = 0.1.90
|
version = 0.1.91
|
||||||
author = guanjihuan
|
author = guanjihuan
|
||||||
author_email = guanjihuan@163.com
|
author_email = guanjihuan@163.com
|
||||||
description = An open source python package
|
description = An open source python package
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
Metadata-Version: 2.1
|
Metadata-Version: 2.1
|
||||||
Name: guan
|
Name: guan
|
||||||
Version: 0.1.90
|
Version: 0.1.91
|
||||||
Summary: An open source python package
|
Summary: An open source python package
|
||||||
Home-page: https://py.guanjihuan.com
|
Home-page: https://py.guanjihuan.com
|
||||||
Author: guanjihuan
|
Author: guanjihuan
|
||||||
|
@ -751,24 +751,25 @@ def pdf_to_txt_for_a_specific_page(pdf_path, page_num=1):
|
|||||||
def get_links_from_pdf(pdf_path, link_starting_form=''):
    """Collect the URI targets of link annotations in a PDF file.

    Every page is checked for an '/Annots' entry. For each annotation that
    has an '/A' action containing a '/URI' key, the URI is kept when it
    matches the regular expression ``'^' + link_starting_form``.

    Args:
        pdf_path: Passed unchanged to ``PyPDF2.PdfReader``.
        link_starting_form: Required beginning of each returned link. It is
            inserted into a regular expression without escaping, so regex
            metacharacters in it (for example '.') keep their regex meaning.
            The default '' accepts every URI.

    Returns:
        list: The matching URIs in page order, then annotation order. A URI
        equal to the one appended immediately before it on the same page is
        skipped; non-adjacent repeats and repeats on other pages are kept.
    """
    import PyPDF2
    import re

    reader = PyPDF2.PdfReader(pdf_path)
    # The pattern is the same for every annotation, so build it only once.
    prefix_pattern = re.compile('^' + link_starting_form)
    links = []
    for page in reader.pages:
        if '/Annots' not in page:
            continue
        # Duplicate tracking restarts on every page that has annotations.
        previous = ''
        for annotation in page['/Annots']:
            # Array items may be indirect references; resolve before use.
            entry = annotation.get_object()
            if '/A' not in entry:
                continue
            action = entry['/A']
            # Not every action is a URI action (e.g. internal jumps).
            if '/URI' not in action:
                continue
            uri = action['/URI']
            if prefix_pattern.search(uri) and uri != previous:
                links.append(uri)
                previous = uri
    return links
|
||||||
|
|
||||||
# 通过Sci-Hub网站下载文献
|
# 通过Sci-Hub网站下载文献
|
||||||
|
Loading…
x
Reference in New Issue
Block a user