This commit is contained in:
guanjihuan 2024-03-13 02:45:53 +08:00
parent d17d4433f7
commit 505abf4da0
3 changed files with 13 additions and 12 deletions

View File

@@ -1,7 +1,7 @@
[metadata] [metadata]
# replace with your username: # replace with your username:
name = guan name = guan
version = 0.1.90 version = 0.1.91
author = guanjihuan author = guanjihuan
author_email = guanjihuan@163.com author_email = guanjihuan@163.com
description = An open source python package description = An open source python package

View File

@@ -1,6 +1,6 @@
Metadata-Version: 2.1 Metadata-Version: 2.1
Name: guan Name: guan
Version: 0.1.90 Version: 0.1.91
Summary: An open source python package Summary: An open source python package
Home-page: https://py.guanjihuan.com Home-page: https://py.guanjihuan.com
Author: guanjihuan Author: guanjihuan

View File

@@ -751,19 +751,20 @@ def pdf_to_txt_for_a_specific_page(pdf_path, page_num=1):
def get_links_from_pdf(pdf_path, link_starting_form=''): def get_links_from_pdf(pdf_path, link_starting_form=''):
import PyPDF2 import PyPDF2
import re import re
pdfReader = PyPDF2.PdfFileReader(pdf_path) reader = PyPDF2.PdfReader(pdf_path)
pages = pdfReader.getNumPages() pages = len(reader.pages)
i0 = 0 i0 = 0
links = [] links = []
for page in range(pages): for page in range(pages):
pageSliced = pdfReader.getPage(page) pageSliced = reader.pages[page]
pageObject = pageSliced.getObject() pageObject = pageSliced.get_object()
if '/Annots' in pageObject.keys(): if '/Annots' in pageObject.keys():
ann = pageObject['/Annots'] ann = pageObject['/Annots']
old = '' old = ''
for a in ann: for a in ann:
u = a.getObject() u = a.get_object()
if '/A' in u.keys(): if '/A' in u.keys():
if '/URI' in u['/A']:
if re.search(re.compile('^'+link_starting_form), u['/A']['/URI']): if re.search(re.compile('^'+link_starting_form), u['/A']['/URI']):
if u['/A']['/URI'] != old: if u['/A']['/URI'] != old:
links.append(u['/A']['/URI']) links.append(u['/A']['/URI'])