diff --git a/API_Reference/API_Reference.py b/API_Reference/API_Reference.py index eb11889..0a645f4 100644 --- a/API_Reference/API_Reference.py +++ b/API_Reference/API_Reference.py @@ -1,4 +1,4 @@ -# API Reference shows all functions in Guan package. The current version is guan-0.0.193, updated on December 12, 2023. +# API Reference shows all functions in Guan package. The current version is guan-0.0.194, updated on December 23, 2023. import guan @@ -790,6 +790,70 @@ guan.print_array_with_index(array, show_index=1, index_type=0) +# Module 12: file processing + +# 自动先后运行程序(串行) +guan.run_programs_sequentially(program_files=['./a.py', './b.py'], execute='python ', show_time=0) + +# 如果不存在文件夹,则新建文件夹 +guan.make_directory(directory='./test') + +# 复制一份文件 +guan.copy_file(file1='./a.txt', file2='./b.txt') + +# 拼接两个PDF文件 +guan.combine_two_pdf_files(input_file_1='a.pdf', input_file_2='b.pdf', output_file='combined_file.pdf') + +# 将PDF文件转成文本 +content = guan.pdf_to_text(pdf_path) + +# 获取PDF文件页数 +num_pages = guan.get_pdf_page_number(pdf_path) + +# 获取PDF文件指定页面的内容 +page_text = guan.pdf_to_txt_for_a_specific_page(pdf_path, page_num=1) + +# 获取PDF文献中的链接。例如: link_starting_form='https://doi.org' +links = guan.get_links_from_pdf(pdf_path, link_starting_form='') + +# 通过Sci-Hub网站下载文献 +guan.download_with_scihub(address=None, num=1) + +# 将文件目录结构写入Markdown文件 +guan.write_file_list_in_markdown(directory='./', filename='a', reverse_positive_or_negative=1, starting_from_h1=None, banned_file_format=[], hide_file_format=None, divided_line=None, show_second_number=None, show_third_number=None) + +# 查找文件名相同的文件 +repeated_file = guan.find_repeated_file_with_same_filename(directory='./', ignored_directory_with_words=[], ignored_file_with_words=[], num=1000) + +# 统计各个子文件夹中的文件数量 +guan.count_file_in_sub_directory(directory='./', smaller_than_num=None) + +# 产生必要的文件,例如readme.md +guan.creat_necessary_file(directory, filename='readme', file_format='.md', content='', overwrite=None, ignored_directory_with_words=[]) + +# 删除特定文件名的文件 +guan.delete_file_with_specific_name(directory, filename='readme', file_format='.md') + +# 所有文件移到根目录(慎用) +guan.move_all_files_to_root_directory(directory) + +# 改变当前的目录位置 +guan.change_directory_by_replacement(current_key_word='code', new_key_word='data') + +# 生成二维码 +guan.creat_qrcode(data="https://www.guanjihuan.com", filename='a', file_format='.png') + +# 将文本转成音频 +guan.str_to_audio(str='hello world', filename='str', rate=125, voice=1, read=1, save=0, compress=0, bitrate='16k', print_text=0) + +# 将txt文件转成音频 +guan.txt_to_audio(txt_path, rate=125, voice=1, read=1, save=0, compress=0, bitrate='16k', print_text=0) + +# 将PDF文件转成音频 +guan.pdf_to_audio(pdf_path, rate=125, voice=1, read=1, save=0, compress=0, bitrate='16k', print_text=0) + +# 将wav音频文件压缩成MP3音频文件 +guan.compress_wav_to_mp3(wav_path, output_filename='a.mp3', bitrate='16k') @@ -804,7 +868,25 @@ guan.print_array_with_index(array, show_index=1, index_type=0) -# Module 12: data processing + + + + + + + + + + + + + + + + + + +# Module 13: data processing # 并行计算前的预处理,把参数分成多份 parameter_array = guan.preprocess_for_parallel_calculations(parameter_array_all, cpus=1, task_index=0) @@ -896,92 +978,18 @@ guan.play_selected_academic_words(reverse=0, random_on=0, bre_or_ame='ame', show # 播放元素周期表上的单词 guan.play_element_words(random_on=0, show_translation=1, show_link=1, translation_time=2, rest_time=1) +# 获取当前函数名 +current_function_name = guan.get_current_function_name() + +# 获取调用本函数的函数名 +calling_function_name = guan.get_calling_function_name(layer=1) + +# 获取Guan软件包当前模块的所有函数名 +function_names = guan.get_all_function_names_in_current_module() + +# 统计Guan软件包中的函数数量 +num_functions = guan.count_functions_in_current_module() - - - - - - - - - - - - - - - - - - - - - -# Module 13: file processing - -# 自动先后运行程序(串行) -guan.run_programs_sequentially(program_files=['./a.py', './b.py'], execute='python ', show_time=0) - -# 如果不存在文件夹,则新建文件夹 -guan.make_directory(directory='./test') - -# 复制一份文件 -guan.copy_file(file1='./a.txt', file2='./b.txt') - -# 拼接两个PDF文件 -guan.combine_two_pdf_files(input_file_1='a.pdf', input_file_2='b.pdf', output_file='combined_file.pdf') - -# 将PDF文件转成文本 -content = guan.pdf_to_text(pdf_path) - -# 获取PDF文件页数 -num_pages = guan.get_pdf_page_number(pdf_path) - -# 获取PDF文件指定页面的内容 -page_text = guan.pdf_to_txt_for_a_specific_page(pdf_path, page_num=1) - -# 获取PDF文献中的链接。例如: link_starting_form='https://doi.org' -links = guan.get_links_from_pdf(pdf_path, link_starting_form='') - -# 通过Sci-Hub网站下载文献 -guan.download_with_scihub(address=None, num=1) - -# 将文件目录结构写入Markdown文件 -guan.write_file_list_in_markdown(directory='./', filename='a', reverse_positive_or_negative=1, starting_from_h1=None, banned_file_format=[], hide_file_format=None, divided_line=None, show_second_number=None, show_third_number=None) - -# 查找文件名相同的文件 -repeated_file = guan.find_repeated_file_with_same_filename(directory='./', ignored_directory_with_words=[], ignored_file_with_words=[], num=1000) - -# 统计各个子文件夹中的文件数量 -guan.count_file_in_sub_directory(directory='./', smaller_than_num=None) - -# 产生必要的文件,例如readme.md -guan.creat_necessary_file(directory, filename='readme', file_format='.md', content='', overwrite=None, ignored_directory_with_words=[]) - -# 删除特定文件名的文件 -guan.delete_file_with_specific_name(directory, filename='readme', file_format='.md') - -# 所有文件移到根目录(慎用) -guan.move_all_files_to_root_directory(directory) - -# 改变当前的目录位置 -guan.change_directory_by_replacement(current_key_word='code', new_key_word='data') - -# 生成二维码 -guan.creat_qrcode(data="https://www.guanjihuan.com", filename='a', file_format='.png') - -# 将文本转成音频 -guan.str_to_audio(str='hello world', filename='str', rate=125, voice=1, read=1, save=0, compress=0, bitrate='16k', print_text=0) - -# 将txt文件转成音频 -guan.txt_to_audio(txt_path, rate=125, voice=1, read=1, save=0, compress=0, bitrate='16k', print_text=0) - -# 将PDF文件转成音频 -guan.pdf_to_audio(pdf_path, rate=125, voice=1, read=1, save=0, compress=0, bitrate='16k', print_text=0) - -# 将wav音频文件压缩成MP3音频文件 -guan.compress_wav_to_mp3(wav_path, output_filename='a.mp3', bitrate='16k') diff --git a/Source_Code/PyPI/setup.cfg b/Source_Code/PyPI/setup.cfg index eb3ca4e..b63d48e 100644 --- a/Source_Code/PyPI/setup.cfg +++ b/Source_Code/PyPI/setup.cfg @@ -1,7 +1,7 @@ [metadata] # replace with your username: name = guan -version = 0.0.193 +version = 0.1.0 author = guanjihuan author_email = guanjihuan@163.com description = An open source python package diff --git a/Source_Code/PyPI/src/guan.egg-info/PKG-INFO b/Source_Code/PyPI/src/guan.egg-info/PKG-INFO index d58ff58..7af2a9e 100644 --- a/Source_Code/PyPI/src/guan.egg-info/PKG-INFO +++ b/Source_Code/PyPI/src/guan.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: guan -Version: 0.0.193 +Version: 0.1.0 Summary: An open source python package Home-page: https://py.guanjihuan.com Author: guanjihuan diff --git a/Source_Code/PyPI/src/guan/__init__.py b/Source_Code/PyPI/src/guan/__init__.py index 571d86c..ebaa366 100644 --- a/Source_Code/PyPI/src/guan/__init__.py +++ b/Source_Code/PyPI/src/guan/__init__.py @@ -1,6 +1,6 @@ # Guan is an open-source python package developed and maintained by https://www.guanjihuan.com/about (Ji-Huan Guan, 关济寰). The primary location of this package is on website https://py.guanjihuan.com. GitHub link: https://github.com/guanjihuan/py.guanjihuan.com. -# The current version is guan-0.0.193, updated on December 12, 2023. +# The current version is guan-0.1.0, updated on December 23, 2023. # Installation: pip install --upgrade guan @@ -17,8 +17,8 @@ # # Module 9: topological invariant # # Module 10: plot figures # # Module 11: read and write -# # Module 12: data processing -# # Module 13: file processing +# # Module 12: file processing +# # Module 13: data processing @@ -3525,7 +3525,533 @@ def print_array_with_index(array, show_index=1, index_type=0): -# Module 12: data processing + + + + + + + + + + + + + +# Module 12: file processing + +# 自动先后运行程序(串行) +def run_programs_sequentially(program_files=['./a.py', './b.py'], execute='python ', show_time=0): + import os + import time + if show_time == 1: + start = time.time() + i0 = 0 + for program_file in program_files: + i0 += 1 + if show_time == 1: + start_0 = time.time() + os.system(execute+program_file) + if show_time == 1: + end_0 = time.time() + print('Running time of program_'+str(i0)+' = '+str((end_0-start_0)/60)+' min') + if show_time == 1: + end = time.time() + print('Total running time = '+str((end-start)/60)+' min') + +# 如果不存在文件夹,则新建文件夹 +def make_directory(directory='./test'): + import os + if not os.path.exists(directory): + os.makedirs(directory) + +# 复制一份文件 +def copy_file(file1='./a.txt', file2='./b.txt'): + import shutil + shutil.copy(file1, file2) + +# 拼接两个PDF文件 +def combine_two_pdf_files(input_file_1='a.pdf', input_file_2='b.pdf', output_file='combined_file.pdf'): + import PyPDF2 + output_pdf = PyPDF2.PdfWriter() + with open(input_file_1, 'rb') as file1: + pdf1 = PyPDF2.PdfReader(file1) + for page in range(len(pdf1.pages)): + output_pdf.add_page(pdf1.pages[page]) + with open(input_file_2, 'rb') as file2: + pdf2 = PyPDF2.PdfReader(file2) + for page in range(len(pdf2.pages)): + output_pdf.add_page(pdf2.pages[page]) + with open(output_file, 'wb') as combined_file: + output_pdf.write(combined_file) + +# 将PDF文件转成文本 +def pdf_to_text(pdf_path): + from pdfminer.pdfparser import PDFParser, PDFDocument + from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter + from pdfminer.converter import PDFPageAggregator + from pdfminer.layout import LAParams, LTTextBox + from pdfminer.pdfinterp import PDFTextExtractionNotAllowed + import logging + logging.Logger.propagate = False + logging.getLogger().setLevel(logging.ERROR) + praser = PDFParser(open(pdf_path, 'rb')) + doc = PDFDocument() + praser.set_document(doc) + doc.set_parser(praser) + doc.initialize() + if not doc.is_extractable: + raise PDFTextExtractionNotAllowed + else: + rsrcmgr = PDFResourceManager() + laparams = LAParams() + device = PDFPageAggregator(rsrcmgr, laparams=laparams) + interpreter = PDFPageInterpreter(rsrcmgr, device) + content = '' + for page in doc.get_pages(): + interpreter.process_page(page) + layout = device.get_result() + for x in layout: + if isinstance(x, LTTextBox): + content = content + x.get_text().strip() + return content + +# 获取PDF文件页数 +def get_pdf_page_number(pdf_path): + import PyPDF2 + pdf_file = open(pdf_path, 'rb') + pdf_reader = PyPDF2.PdfReader(pdf_file) + num_pages = len(pdf_reader.pages) + return num_pages + +# 获取PDF文件指定页面的内容 +def pdf_to_txt_for_a_specific_page(pdf_path, page_num=1): + import PyPDF2 + pdf_file = open(pdf_path, 'rb') + pdf_reader = PyPDF2.PdfReader(pdf_file) + num_pages = len(pdf_reader.pages) + for page_num0 in range(num_pages): + if page_num0 == page_num-1: + page = pdf_reader.pages[page_num0] + page_text = page.extract_text() + pdf_file.close() + return page_text + +# 获取PDF文献中的链接。例如: link_starting_form='https://doi.org' +def get_links_from_pdf(pdf_path, link_starting_form=''): + import PyPDF2 + import re + pdfReader = PyPDF2.PdfFileReader(pdf_path) + pages = pdfReader.getNumPages() + i0 = 0 + links = [] + for page in range(pages): + pageSliced = pdfReader.getPage(page) + pageObject = pageSliced.getObject() + if '/Annots' in pageObject.keys(): + ann = pageObject['/Annots'] + old = '' + for a in ann: + u = a.getObject() + if '/A' in u.keys(): + if re.search(re.compile('^'+link_starting_form), u['/A']['/URI']): + if u['/A']['/URI'] != old: + links.append(u['/A']['/URI']) + i0 += 1 + old = u['/A']['/URI'] + return links + +# 通过Sci-Hub网站下载文献 +def download_with_scihub(address=None, num=1): + from bs4 import BeautifulSoup + import re + import requests + import os + if num==1 and address!=None: + address_array = [address] + else: + address_array = [] + for i in range(num): + address = input('\nInput:') + address_array.append(address) + for address in address_array: + r = requests.post('https://sci-hub.st/', data={'request': address}) + print('\nResponse:', r) + print('Address:', r.url) + soup = BeautifulSoup(r.text, features='lxml') + pdf_URL = soup.embed['src'] + # pdf_URL = soup.iframe['src'] # This is a code line of history version which fails to get pdf URL. + if re.search(re.compile('^https:'), pdf_URL): + pass + else: + pdf_URL = 'https:'+pdf_URL + print('PDF address:', pdf_URL) + name = re.search(re.compile('fdp.*?/'),pdf_URL[::-1]).group()[::-1][1::] + print('PDF name:', name) + print('Directory:', os.getcwd()) + print('\nDownloading...') + r = requests.get(pdf_URL, stream=True) + with open(name, 'wb') as f: + for chunk in r.iter_content(chunk_size=32): + f.write(chunk) + print('Completed!\n') + if num != 1: + print('All completed!\n') + +# 将文件目录结构写入Markdown文件 +def write_file_list_in_markdown(directory='./', filename='a', reverse_positive_or_negative=1, starting_from_h1=None, banned_file_format=[], hide_file_format=None, divided_line=None, show_second_number=None, show_third_number=None): + import os + f = open(filename+'.md', 'w', encoding="utf-8") + filenames1 = os.listdir(directory) + u0 = 0 + for filename1 in filenames1[::reverse_positive_or_negative]: + filename1_with_path = os.path.join(directory,filename1) + if os.path.isfile(filename1_with_path): + if os.path.splitext(filename1)[1] not in banned_file_format: + if hide_file_format == None: + f.write('+ '+str(filename1)+'\n\n') + else: + f.write('+ '+str(os.path.splitext(filename1)[0])+'\n\n') + else: + u0 += 1 + if divided_line != None and u0 != 1: + f.write('--------\n\n') + if starting_from_h1 == None: + f.write('#') + f.write('# '+str(filename1)+'\n\n') + + filenames2 = os.listdir(filename1_with_path) + i0 = 0 + for filename2 in filenames2[::reverse_positive_or_negative]: + filename2_with_path = os.path.join(directory, filename1, filename2) + if os.path.isfile(filename2_with_path): + if os.path.splitext(filename2)[1] not in banned_file_format: + if hide_file_format == None: + f.write('+ '+str(filename2)+'\n\n') + else: + f.write('+ '+str(os.path.splitext(filename2)[0])+'\n\n') + else: + i0 += 1 + if starting_from_h1 == None: + f.write('#') + if show_second_number != None: + f.write('## '+str(i0)+'. '+str(filename2)+'\n\n') + else: + f.write('## '+str(filename2)+'\n\n') + + j0 = 0 + filenames3 = os.listdir(filename2_with_path) + for filename3 in filenames3[::reverse_positive_or_negative]: + filename3_with_path = os.path.join(directory, filename1, filename2, filename3) + if os.path.isfile(filename3_with_path): + if os.path.splitext(filename3)[1] not in banned_file_format: + if hide_file_format == None: + f.write('+ '+str(filename3)+'\n\n') + else: + f.write('+ '+str(os.path.splitext(filename3)[0])+'\n\n') + else: + j0 += 1 + if starting_from_h1 == None: + f.write('#') + if show_third_number != None: + f.write('### ('+str(j0)+') '+str(filename3)+'\n\n') + else: + f.write('### '+str(filename3)+'\n\n') + + filenames4 = os.listdir(filename3_with_path) + for filename4 in filenames4[::reverse_positive_or_negative]: + filename4_with_path = os.path.join(directory, filename1, filename2, filename3, filename4) + if os.path.isfile(filename4_with_path): + if os.path.splitext(filename4)[1] not in banned_file_format: + if hide_file_format == None: + f.write('+ '+str(filename4)+'\n\n') + else: + f.write('+ '+str(os.path.splitext(filename4)[0])+'\n\n') + else: + if starting_from_h1 == None: + f.write('#') + f.write('#### '+str(filename4)+'\n\n') + + filenames5 = os.listdir(filename4_with_path) + for filename5 in filenames5[::reverse_positive_or_negative]: + filename5_with_path = os.path.join(directory, filename1, filename2, filename3, filename4, filename5) + if os.path.isfile(filename5_with_path): + if os.path.splitext(filename5)[1] not in banned_file_format: + if hide_file_format == None: + f.write('+ '+str(filename5)+'\n\n') + else: + f.write('+ '+str(os.path.splitext(filename5)[0])+'\n\n') + else: + if starting_from_h1 == None: + f.write('#') + f.write('##### '+str(filename5)+'\n\n') + + filenames6 = os.listdir(filename5_with_path) + for filename6 in filenames6[::reverse_positive_or_negative]: + filename6_with_path = os.path.join(directory, filename1, filename2, filename3, filename4, filename5, filename6) + if os.path.isfile(filename6_with_path): + if os.path.splitext(filename6)[1] not in banned_file_format: + if hide_file_format == None: + f.write('+ '+str(filename6)+'\n\n') + else: + f.write('+ '+str(os.path.splitext(filename6)[0])+'\n\n') + else: + if starting_from_h1 == None: + f.write('#') + f.write('###### '+str(filename6)+'\n\n') + f.close() + +# 查找文件名相同的文件 +def find_repeated_file_with_same_filename(directory='./', ignored_directory_with_words=[], ignored_file_with_words=[], num=1000): + import os + from collections import Counter + file_list = [] + for root, dirs, files in os.walk(directory): + for i0 in range(len(files)): + file_list.append(files[i0]) + for word in ignored_directory_with_words: + if word in root: + file_list.remove(files[i0]) + for word in ignored_file_with_words: + if word in files[i0]: + try: + file_list.remove(files[i0]) + except: + pass + count_file = Counter(file_list).most_common(num) + repeated_file = [] + for item in count_file: + if item[1]>1: + repeated_file.append(item) + return repeated_file + +# 统计各个子文件夹中的文件数量 +def count_file_in_sub_directory(directory='./', smaller_than_num=None): + import os + from collections import Counter + dirs_list = [] + for root, dirs, files in os.walk(directory): + if dirs != []: + for i0 in range(len(dirs)): + dirs_list.append(root+'/'+dirs[i0]) + for sub_dir in dirs_list: + file_list = [] + for root, dirs, files in os.walk(sub_dir): + for i0 in range(len(files)): + file_list.append(files[i0]) + count_file = len(file_list) + if smaller_than_num == None: + print(sub_dir) + print(count_file) + print() + else: + if count_file1: - repeated_file.append(item) - return repeated_file - -# 统计各个子文件夹中的文件数量 -def count_file_in_sub_directory(directory='./', smaller_than_num=None): - import os - from collections import Counter - dirs_list = [] - for root, dirs, files in os.walk(directory): - if dirs != []: - for i0 in range(len(dirs)): - dirs_list.append(root+'/'+dirs[i0]) - for sub_dir in dirs_list: - file_list = [] - for root, dirs, files in os.walk(sub_dir): - for i0 in range(len(files)): - file_list.append(files[i0]) - count_file = len(file_list) - if smaller_than_num == None: - print(sub_dir) - print(count_file) - print() - else: - if count_file