# Module: others

# Get the locally installed version of a package
def get_current_version(package_name='guan'):
    import importlib.metadata
    try:
        current_version = importlib.metadata.version(package_name)
        return current_version
    except importlib.metadata.PackageNotFoundError:
        return None

# Get the latest version of a Python package from PyPI
def get_latest_version(package_name='guan', timeout=5):
    import requests
    url = f"https://pypi.org/pypi/{package_name}/json"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        return None
    if response.status_code == 200:
        data = response.json()
        latest_version = data["info"]["version"]
        return latest_version
    else:
        return None

# Get the PID values of the processes whose command line contains a given string
def get_PID_array(name):
    import subprocess
    import re
    command = "ps -ef | grep "+name
    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    id_running_array = []
    if result.returncode == 0:
        ps_ef = result.stdout
        ps_ef_1 = re.split(r'\n', ps_ef)
        for ps_ef_item in ps_ef_1:
            if ps_ef_item != '':
                ps_ef_2 = re.split(r'\s+', ps_ef_item)
                id_running_array.append(ps_ef_2[1])
    return id_running_array

# Statistics of the number of git commits per day
def statistics_of_git_commits(print_show=0, str_or_datetime='str'):
    import subprocess
    import collections
    since_date = '100 year ago'
    result = subprocess.run(
        ['git', 'log', f'--since={since_date}', '--pretty=format:%ad', '--date=short'],
        stdout=subprocess.PIPE, text=True)
    commits = result.stdout.strip().split('\n')
    counter = collections.Counter(commits)
    daily_commit_counts = dict(sorted(counter.items()))
    date_array = []
    commit_count_array = []
    for date, count in daily_commit_counts.items():
        if print_show == 1:
            print(f"{date}: {count} commits")
        if str_or_datetime == 'datetime':
            import datetime
            date_array.append(datetime.datetime.strptime(date, "%Y-%m-%d"))
        elif str_or_datetime == 'str':
            date_array.append(date)
        commit_count_array.append(count)
    return date_array, commit_count_array
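# Usage sketch (illustrative, not part of the original module): count the daily commits of the
# current repository and plot them. This assumes the code runs inside a git repository and that
# matplotlib is installed; neither assumption comes from the function above.
#
#   import matplotlib.pyplot as plt
#   date_array, commit_count_array = statistics_of_git_commits(str_or_datetime='datetime')
#   plt.plot(date_array, commit_count_array)
#   plt.xlabel('date')
#   plt.ylabel('commits per day')
#   plt.show()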
# Write the file and directory structure into a Markdown file
def write_file_list_in_markdown(directory='./', filename='a', reverse_positive_or_negative=1, starting_from_h1=None, banned_file_format=[], hide_file_format=None, divided_line=None, show_second_number=None, show_third_number=None):
    import os
    f = open(filename+'.md', 'w', encoding="utf-8")
    filenames1 = os.listdir(directory)
    u0 = 0
    for filename1 in filenames1[::reverse_positive_or_negative]:
        filename1_with_path = os.path.join(directory, filename1)
        if os.path.isfile(filename1_with_path):
            if os.path.splitext(filename1)[1] not in banned_file_format:
                if hide_file_format == None:
                    f.write('+ '+str(filename1)+'\n\n')
                else:
                    f.write('+ '+str(os.path.splitext(filename1)[0])+'\n\n')
        else:
            u0 += 1
            if divided_line != None and u0 != 1:
                f.write('--------\n\n')
            if starting_from_h1 == None:
                f.write('#')
            f.write('# '+str(filename1)+'\n\n')
            filenames2 = os.listdir(filename1_with_path)
            i0 = 0
            for filename2 in filenames2[::reverse_positive_or_negative]:
                filename2_with_path = os.path.join(directory, filename1, filename2)
                if os.path.isfile(filename2_with_path):
                    if os.path.splitext(filename2)[1] not in banned_file_format:
                        if hide_file_format == None:
                            f.write('+ '+str(filename2)+'\n\n')
                        else:
                            f.write('+ '+str(os.path.splitext(filename2)[0])+'\n\n')
                else:
                    i0 += 1
                    if starting_from_h1 == None:
                        f.write('#')
                    if show_second_number != None:
                        f.write('## '+str(i0)+'. '+str(filename2)+'\n\n')
                    else:
                        f.write('## '+str(filename2)+'\n\n')
                    j0 = 0
                    filenames3 = os.listdir(filename2_with_path)
                    for filename3 in filenames3[::reverse_positive_or_negative]:
                        filename3_with_path = os.path.join(directory, filename1, filename2, filename3)
                        if os.path.isfile(filename3_with_path):
                            if os.path.splitext(filename3)[1] not in banned_file_format:
                                if hide_file_format == None:
                                    f.write('+ '+str(filename3)+'\n\n')
                                else:
                                    f.write('+ '+str(os.path.splitext(filename3)[0])+'\n\n')
                        else:
                            j0 += 1
                            if starting_from_h1 == None:
                                f.write('#')
                            if show_third_number != None:
                                f.write('### ('+str(j0)+') '+str(filename3)+'\n\n')
                            else:
                                f.write('### '+str(filename3)+'\n\n')
                            filenames4 = os.listdir(filename3_with_path)
                            for filename4 in filenames4[::reverse_positive_or_negative]:
                                filename4_with_path = os.path.join(directory, filename1, filename2, filename3, filename4)
                                if os.path.isfile(filename4_with_path):
                                    if os.path.splitext(filename4)[1] not in banned_file_format:
                                        if hide_file_format == None:
                                            f.write('+ '+str(filename4)+'\n\n')
                                        else:
                                            f.write('+ '+str(os.path.splitext(filename4)[0])+'\n\n')
                                else:
                                    if starting_from_h1 == None:
                                        f.write('#')
                                    f.write('#### '+str(filename4)+'\n\n')
                                    filenames5 = os.listdir(filename4_with_path)
                                    for filename5 in filenames5[::reverse_positive_or_negative]:
                                        filename5_with_path = os.path.join(directory, filename1, filename2, filename3, filename4, filename5)
                                        if os.path.isfile(filename5_with_path):
                                            if os.path.splitext(filename5)[1] not in banned_file_format:
                                                if hide_file_format == None:
                                                    f.write('+ '+str(filename5)+'\n\n')
                                                else:
                                                    f.write('+ '+str(os.path.splitext(filename5)[0])+'\n\n')
                                        else:
                                            if starting_from_h1 == None:
                                                f.write('#')
                                            f.write('##### '+str(filename5)+'\n\n')
                                            filenames6 = os.listdir(filename5_with_path)
                                            for filename6 in filenames6[::reverse_positive_or_negative]:
                                                filename6_with_path = os.path.join(directory, filename1, filename2, filename3, filename4, filename5, filename6)
                                                if os.path.isfile(filename6_with_path):
                                                    if os.path.splitext(filename6)[1] not in banned_file_format:
                                                        if hide_file_format == None:
                                                            f.write('+ '+str(filename6)+'\n\n')
                                                        else:
                                                            f.write('+ '+str(os.path.splitext(filename6)[0])+'\n\n')
                                                else:
                                                    if starting_from_h1 == None:
                                                        f.write('#')
                                                    f.write('###### '+str(filename6)+'\n\n')
    f.close()

# Get the content from the tags of a web page
def get_html_from_tags(link, tags=['title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'a']):
    from bs4 import BeautifulSoup
    import urllib.request
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context
    html = urllib.request.urlopen(link).read().decode('utf-8')
    soup = BeautifulSoup(html, features="lxml")
    all_tags = soup.find_all(tags)
    content = ''
    for tag in all_tags:
        text = tag.get_text().replace('\n', '')
        if content == '':
            content = text
        else:
            content = content + '\n\n' + text
    return content

# Get all the links from an HTML page
def get_links_from_html(html_link, links_with_text=0):
    from bs4 import BeautifulSoup
    import urllib.request
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context
    html = urllib.request.urlopen(html_link).read().decode('utf-8')
    soup = BeautifulSoup(html, features="lxml")
    a_tags = soup.find_all('a')
    if links_with_text == 0:
        link_array = [tag.get('href') for tag in a_tags if tag.get('href')]
        return link_array
    else:
        link_array_with_text = [(tag.get('href'), tag.text) for tag in a_tags if tag.get('href')]
        return link_array_with_text
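# Usage sketch (illustrative, not part of the original module): collect the links of a page
# together with their anchor text. The URL below is only an example placeholder.
#
#   link_array_with_text = get_links_from_html('https://www.guanjihuan.com', links_with_text=1)
#   for link, text in link_array_with_text:
#       print(text, link)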
# Check the validity of a link
def check_link(url, timeout=3, allow_redirects=True):
    import requests
    try:
        response = requests.head(url, timeout=timeout, allow_redirects=allow_redirects)
        if response.status_code == 200:
            return True
        else:
            return False
    except requests.exceptions.RequestException:
        return False

# Check the validity of the links in a link array
def check_link_array(link_array, timeout=3, allow_redirects=True, try_again=0, print_show=1):
    import guan
    failed_link_array0 = []
    for link in link_array:
        if link == '#' or guan.check_link(link, timeout=timeout, allow_redirects=allow_redirects):
            pass
        else:
            failed_link_array0.append(link)
            if print_show:
                print(link)
    failed_link_array = []
    if try_again:
        if print_show:
            print('\nTry again:\n')
        for link in failed_link_array0:
            if link == '#' or guan.check_link(link, timeout=timeout, allow_redirects=allow_redirects):
                pass
            else:
                failed_link_array.append(link)
                if print_show:
                    print(link)
    else:
        failed_link_array = failed_link_array0
    return failed_link_array

# Generate a QR code
def creat_qrcode(data="https://www.guanjihuan.com", filename='a', file_format='.png'):
    import qrcode
    img = qrcode.make(data)
    img.save(filename+file_format)

# Convert a PDF file to text
def pdf_to_text(pdf_path):
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTTextBox
    from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
    import logging
    logging.Logger.propagate = False
    logging.getLogger().setLevel(logging.ERROR)
    parser = PDFParser(open(pdf_path, 'rb'))
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        content = ''
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBox):
                    content = content + x.get_text().strip()
    return content

# Get the number of pages of a PDF file
def get_pdf_page_number(pdf_path):
    import PyPDF2
    pdf_file = open(pdf_path, 'rb')
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)
    pdf_file.close()
    return num_pages

# Get the content of a specific page of a PDF file
def pdf_to_txt_for_a_specific_page(pdf_path, page_num=1):
    import PyPDF2
    pdf_file = open(pdf_path, 'rb')
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)
    page_text = None
    for page_num0 in range(num_pages):
        if page_num0 == page_num-1:
            page = pdf_reader.pages[page_num0]
            page_text = page.extract_text()
    pdf_file.close()
    return page_text

# Get the links in a PDF document. Example: link_starting_form='https://doi.org'
def get_links_from_pdf(pdf_path, link_starting_form=''):
    import PyPDF2
    import re
    reader = PyPDF2.PdfReader(pdf_path)
    pages = len(reader.pages)
    i0 = 0
    links = []
    for page in range(pages):
        pageSliced = reader.pages[page]
        pageObject = pageSliced.get_object()
        if '/Annots' in pageObject.keys():
            ann = pageObject['/Annots']
            old = ''
            for a in ann:
                u = a.get_object()
                if '/A' in u.keys():
                    if '/URI' in u['/A']:
                        if re.search(re.compile('^'+link_starting_form), u['/A']['/URI']):
                            if u['/A']['/URI'] != old:
                                links.append(u['/A']['/URI'])
                                i0 += 1
                                old = u['/A']['/URI']
    return links
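# Usage sketch (illustrative, not part of the original module): extract the DOI links that a
# paper's PDF annotations point to. 'a.pdf' is a placeholder file name.
#
#   doi_links = get_links_from_pdf('a.pdf', link_starting_form='https://doi.org')
#   for doi_link in doi_links:
#       print(doi_link)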
# Download papers through the Sci-Hub website
def download_with_scihub(address=None, num=1):
    from bs4 import BeautifulSoup
    import re
    import requests
    import os
    if num == 1 and address != None:
        address_array = [address]
    else:
        address_array = []
        for i in range(num):
            address = input('\nInput:')
            address_array.append(address)
    for address in address_array:
        r = requests.post('https://sci-hub.st/', data={'request': address})
        print('\nResponse:', r)
        print('Address:', r.url)
        soup = BeautifulSoup(r.text, features='lxml')
        pdf_URL = soup.embed['src']
        # pdf_URL = soup.iframe['src'] # This line from an earlier version fails to get the pdf URL.
        if re.search(re.compile('^https:'), pdf_URL):
            pass
        else:
            pdf_URL = 'https:'+pdf_URL
        print('PDF address:', pdf_URL)
        name = re.search(re.compile('fdp.*?/'), pdf_URL[::-1]).group()[::-1][1::]
        print('PDF name:', name)
        print('Directory:', os.getcwd())
        print('\nDownloading...')
        r = requests.get(pdf_URL, stream=True)
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=32):
                f.write(chunk)
        print('Completed!\n')
    if num != 1:
        print('All completed!\n')

# Convert a string to audio
def str_to_audio(str='hello world', filename='str', rate=125, voice=1, read=1, save=0, compress=0, bitrate='16k', print_text=0):
    import pyttsx3
    import guan
    if print_text == 1:
        print(str)
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')
    engine.setProperty('voice', voices[voice].id)
    engine.setProperty("rate", rate)
    if save == 1:
        engine.save_to_file(str, filename+'.wav')
        engine.runAndWait()
        print('Wav file saved!')
        if compress == 1:
            import os
            os.rename(filename+'.wav', 'temp.wav')
            guan.compress_wav_to_mp3('temp.wav', output_filename=filename+'.mp3', bitrate=bitrate)
            os.remove('temp.wav')
    if read == 1:
        engine.say(str)
        engine.runAndWait()

# Convert a txt file to audio
def txt_to_audio(txt_path, rate=125, voice=1, read=1, save=0, compress=0, bitrate='16k', print_text=0):
    import pyttsx3
    import guan
    f = open(txt_path, 'r', encoding='utf-8')
    text = f.read()
    f.close()
    if print_text == 1:
        print(text)
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')
    engine.setProperty('voice', voices[voice].id)
    engine.setProperty("rate", rate)
    if save == 1:
        import re
        filename = re.split(r'[/,\\]', txt_path)[-1][:-4]
        engine.save_to_file(text, filename+'.wav')
        engine.runAndWait()
        print('Wav file saved!')
        if compress == 1:
            import os
            os.rename(filename+'.wav', 'temp.wav')
            guan.compress_wav_to_mp3('temp.wav', output_filename=filename+'.mp3', bitrate=bitrate)
            os.remove('temp.wav')
    if read == 1:
        engine.say(text)
        engine.runAndWait()

# Convert a PDF file to audio
def pdf_to_audio(pdf_path, rate=125, voice=1, read=1, save=0, compress=0, bitrate='16k', print_text=0):
    import pyttsx3
    import guan
    text = guan.pdf_to_text(pdf_path)
    text = text.replace('\n', ' ')
    if print_text == 1:
        print(text)
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')
    engine.setProperty('voice', voices[voice].id)
    engine.setProperty("rate", rate)
    if save == 1:
        import re
        filename = re.split(r'[/,\\]', pdf_path)[-1][:-4]
        engine.save_to_file(text, filename+'.wav')
        engine.runAndWait()
        print('Wav file saved!')
        if compress == 1:
            import os
            os.rename(filename+'.wav', 'temp.wav')
            guan.compress_wav_to_mp3('temp.wav', output_filename=filename+'.mp3', bitrate=bitrate)
            os.remove('temp.wav')
    if read == 1:
        engine.say(text)
        engine.runAndWait()
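# Usage sketch (illustrative, not part of the original module): convert a paper's PDF to speech
# and keep only a compressed MP3 copy. 'a.pdf' is a placeholder path; this relies on pyttsx3,
# the PDF text extraction above, and compress_wav_to_mp3 below (pydub + FFmpeg).
#
#   pdf_to_audio('a.pdf', read=0, save=1, compress=1, bitrate='16k')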
# Compress a WAV audio file into an MP3 audio file
def compress_wav_to_mp3(wav_path, output_filename='a.mp3', bitrate='16k'):
    # Note: besides installing pydub, you may also need to download FFmpeg from http://www.ffmpeg.org/download.html and add its bin path to the environment variables.
    from pydub import AudioSegment
    sound = AudioSegment.from_wav(wav_path)
    sound.export(output_filename, format="mp3", bitrate=bitrate)

# Convert a WordPress-exported XML file into multiple Markdown files
def convert_wordpress_xml_to_markdown(xml_file='./a.xml', convert_content=1, replace_more=[]):
    import xml.etree.ElementTree as ET
    import re
    tree = ET.parse(xml_file)
    root = tree.getroot()
    for item in root.findall('.//item'):
        title = item.find('title').text
        content = item.find('.//content:encoded', namespaces={'content': 'http://purl.org/rss/1.0/modules/content/'}).text
        if convert_content == 1:
            # Strip WordPress block comments and basic HTML tags from the exported content.
            content = re.sub(r'<!--.*?-->', '', content)
            content = content.replace('<p>', '')
            content = content.replace('</p>', '')