This commit is contained in:
guanjihuan 2024-10-30 22:51:59 +08:00
parent 766b931242
commit ab25e2175b
3 changed files with 56 additions and 2 deletions

View File

@ -1,7 +1,7 @@
[metadata] [metadata]
# replace with your username: # replace with your username:
name = guan name = guan
version = 0.1.121 version = 0.1.122
author = guanjihuan author = guanjihuan
author_email = guanjihuan@163.com author_email = guanjihuan@163.com
description = An open source python package description = An open source python package

View File

@ -1,6 +1,6 @@
Metadata-Version: 2.1 Metadata-Version: 2.1
Name: guan Name: guan
Version: 0.1.121 Version: 0.1.122
Summary: An open source python package Summary: An open source python package
Home-page: https://py.guanjihuan.com Home-page: https://py.guanjihuan.com
Author: guanjihuan Author: guanjihuan

View File

@ -694,6 +694,60 @@ def get_html_from_tags(link, tags=['title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
content = content + '\n\n' + text content = content + '\n\n' + text
return content return content
# Get all links from an HTML page
def get_links_from_html(html_link, links_with_text=0):
    """Download a page and collect the ``href`` of every ``<a>`` tag in it.

    Args:
        html_link: URL of the page to download.
        links_with_text: If equal to 0 (default), return a list of href
            strings; otherwise return a list of ``(href, tag_text)`` tuples.

    Returns:
        A list with one entry per ``<a>`` tag that has a non-empty ``href``,
        in the order ``soup.find_all('a')`` yields the tags.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
        Errors raised by ``urllib.request.urlopen`` propagate unchanged.
    """
    from bs4 import BeautifulSoup
    import urllib.request
    import ssl
    # Certificate verification is still skipped for this download, but the
    # unverified context is passed to this single request instead of being
    # installed as ssl._create_default_https_context, which would silently
    # disable verification for every later HTTPS call in the process.
    unverified_context = ssl._create_unverified_context()
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(html_link, context=unverified_context) as response:
        html = response.read().decode('utf-8')
    soup = BeautifulSoup(html, features="lxml")
    want_text = links_with_text != 0
    link_array = []
    for tag in soup.find_all('a'):
        href = tag.get('href')  # looked up once per tag
        if not href:
            # Skip anchors with a missing or empty href.
            continue
        link_array.append((href, tag.text) if want_text else href)
    return link_array
# Check whether a link is valid
def check_link(url, timeout=3, allow_redirects=True):
    """Report whether a HEAD request to ``url`` answers with status 200.

    Args:
        url: The link to probe.
        timeout: Passed to ``requests.head`` as ``timeout``.
        allow_redirects: Passed to ``requests.head`` as ``allow_redirects``.

    Returns:
        True if the response status code is exactly 200; False for any
        other status code or if ``requests`` raises a ``RequestException``.
    """
    import requests
    try:
        reply = requests.head(url, timeout=timeout, allow_redirects=allow_redirects)
    except requests.exceptions.RequestException:
        # Any request-level failure counts as an invalid link.
        return False
    return reply.status_code == 200
# Check the validity of every link in a link array
def check_link_array(link_array, timeout=3, allow_redirects=True, try_again=0, print_show=1):
    """Return the links from ``link_array`` that fail ``guan.check_link``.

    Args:
        link_array: Iterable of links to check. The link ``'#'`` is never
            checked and never reported as failed.
        timeout: Forwarded to ``guan.check_link``.
        allow_redirects: Forwarded to ``guan.check_link``.
        try_again: If truthy, the links that failed the first pass are
            checked a second time and only those failing again are returned.
        print_show: If truthy, each failed link is printed as it is found
            (and a 'Try again' header is printed before the second pass).

    Returns:
        A list of failed links: from the second pass when ``try_again`` is
        truthy, otherwise from the first pass.
    """
    import guan

    def collect_failures(candidates):
        # One pass over the candidates, keeping (and optionally printing)
        # every link that is not '#' and does not pass the check.
        rejected = []
        for candidate in candidates:
            if candidate != '#' and not guan.check_link(candidate, timeout=timeout, allow_redirects=allow_redirects):
                rejected.append(candidate)
                if print_show:
                    print(candidate)
        return rejected

    first_pass = collect_failures(link_array)
    if not try_again:
        return first_pass
    if print_show:
        print('\nTry again:\n')
    return collect_failures(first_pass)
# 生成二维码 # 生成二维码
def creat_qrcode(data="https://www.guanjihuan.com", filename='a', file_format='.png'): def creat_qrcode(data="https://www.guanjihuan.com", filename='a', file_format='.png'):
import qrcode import qrcode