diff --git a/2019.12.03_create_GIF_with_python/create_GIF_with_python.py b/2019.12.03_create_GIF_with_python/create_GIF_with_python.py index 370e0cf..8eee0ef 100644 --- a/2019.12.03_create_GIF_with_python/create_GIF_with_python.py +++ b/2019.12.03_create_GIF_with_python/create_GIF_with_python.py @@ -1,7 +1,4 @@ import imageio -import numpy as np -import os -# os.chdir('D:/data') # 设置文件读取和保存的位置 images = [] for i in range(1000): diff --git a/2020.10.31_download_references_in_a_pdf_file_with_python/download_references_in_a_pdf_file_with_python.py b/2020.10.31_download_references_in_a_pdf_file_with_python/download_references_in_a_pdf_file_with_python.py deleted file mode 100644 index 2402cc2..0000000 --- a/2020.10.31_download_references_in_a_pdf_file_with_python/download_references_in_a_pdf_file_with_python.py +++ /dev/null @@ -1,78 +0,0 @@ -""" -This code is supported by the website: https://www.guanjihuan.com -The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869 -""" - -import PyPDF2 -import os -import re -from bs4 import BeautifulSoup -from urllib.request import urlopen -import requests - - - -def main(): - os.chdir('D:/') # PDF文件存放的位置 - filename = input('输入PDF文件名:') - pdfFile = open(filename+'.pdf','rb') # 打开PDF文件 - links = all_links_in_pdf(pdfFile) # 获取PDF文件中的链接 - pdfFile.close() # 关闭PDF文件 - os.chdir('D:/Reference') # 设置参考文献保存的位置 - download(links) # 下载文献 - - - -def all_links_in_pdf(pdfFile): - pdfReader = PyPDF2.PdfFileReader(pdfFile) - pages = pdfReader.getNumPages() - i0 = 0 - links = [] - print() - for page in range(pages): - pageSliced = pdfReader.getPage(page) - pageObject = pageSliced.getObject() - if '/Annots' in pageObject.keys(): - ann = pageObject['/Annots'] - old = '' - for a in ann: - u = a.getObject() - if '/A' in u.keys(): - if re.search(re.compile('^https://doi.org'), u['/A']['/URI']): # 排除其他形式的链接 - if u['/A']['/URI'] != old: # 排除重复链接 - print(i0 , u['/A']['/URI']) - links.append(u['/A']['/URI']) # 链接存在link数组中 - i0 += 1 - old = u['/A']['/URI'] - return links - - - -def download(links): - for i0 in [0, 1, 3]: # 指定参考文献下载,如需全部下载用for i0 in range(links.shape[0]): - address = links[i0] - r = requests.post('https://sci-hub.st/', data={'request': address}) - print('\n响应结果是:', r) - print('访问的地址是:', r.url) - soup = BeautifulSoup(r.text, features='lxml') - pdf_URL = soup.embed['src'] - # pdf_URL = soup.iframe['src'] # This is a code line of history version which fails to get pdf URL. - if re.search(re.compile('^https:'), pdf_URL): - pass - else: - pdf_URL = 'https:'+pdf_URL - print('PDF的地址是:', pdf_URL) - name = re.search(re.compile('fdp.*?/'),pdf_URL[::-1]).group()[::-1][1::] - print('PDF文件名是:', name) - print('保存的位置在:', os.getcwd()) - print('\n正在下载第',i0,'篇') - r = requests.get(pdf_URL, stream=True) - with open(name, 'wb') as f: - for chunk in r.iter_content(chunk_size=32): - f.write(chunk) - print('第',i0,'篇下载完成!') - print('\n全部下载完成!') - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/2020.10.31_download_references_in_a_pdf_file_with_python/get_links_from_a_pdf_file.py b/2020.10.31_download_references_in_a_pdf_file_with_python/get_links_from_a_pdf_file.py deleted file mode 100644 index 94ca406..0000000 --- a/2020.10.31_download_references_in_a_pdf_file_with_python/get_links_from_a_pdf_file.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -This code is supported by the website: https://www.guanjihuan.com -The newest version of this code is on the web page: https://www.guanjihuan.com/archives/6869 -""" - -import PyPDF2 -import os -import re - -os.chdir('D:/') # PDF文件存放的位置 -filename = input('输入PDF文件名:') -pdfFile = open(filename+'.pdf','rb') -pdfReader = PyPDF2.PdfFileReader(pdfFile) -pages = pdfReader.getNumPages() -i0 = 0 -for page in range(pages): - pageSliced = pdfReader.getPage(page) - pageObject = pageSliced.getObject() - if '/Annots' in pageObject.keys(): - ann = pageObject['/Annots'] - old = '' - for a in ann: - u = a.getObject() - if '/A' in u.keys(): - if re.search(re.compile('^https://doi.org'), u['/A']['/URI']): # 排除其他形式的链接 - if u['/A']['/URI'] != old: # 排除重复链接 - print(i0 , u['/A']['/URI']) - i0 += 1 - old = u['/A']['/URI'] -pdfFile.close() \ No newline at end of file diff --git a/2021.01.13_python_code_for_data_processing/python_code_for_data_processing.py b/2021.01.13_python_code_for_data_processing/python_code_for_data_processing.py deleted file mode 100644 index e8f5568..0000000 --- a/2021.01.13_python_code_for_data_processing/python_code_for_data_processing.py +++ /dev/null @@ -1,322 +0,0 @@ -""" -This code is supported by the website: https://www.guanjihuan.com -The newest version of this code is on the web page: https://www.guanjihuan.com/archives/8734 - -函数调用目录: -1. x, y = read_one_dimensional_data(filename='a') -2. x, y, matrix = read_two_dimensional_data(filename='a') -3. write_one_dimensional_data(x, y, filename='a') -4. write_two_dimensional_data(x, y, matrix, filename='a') -5. plot(x, y, xlabel='x', ylabel='y', title='', filename='a') -6. plot_3d_surface(x, y, matrix, xlabel='x', ylabel='y', zlabel='z', title='', filename='a') -7. plot_contour(x, y, matrix, xlabel='x', ylabel='y', title='', filename='a') -8. plot_2d_scatter(x, y, value, xlabel='x', ylabel='y', title='', filename='a') -9. plot_3d_surface(x, y, z, value, xlabel='x', ylabel='y', zlabel='z', title='', filename='a') -10. creat_animation(image_names, duration_time=0.5, filename='a') -11. eigenvalue_array = calculate_eigenvalue_with_one_paramete(x, matrix) -12. eigenvalue_array = calculate_eigenvalue_with_two_parameters(x, y, matrix) - -函数对应的功能: -1. 读取filename.txt文件中的一维数据y(x) -2. 读取filename.txt文件中的二维数据matrix(x,y) -3. 把一维数据y(x)写入filename.txt文件 -4. 把二维数据matrix(x,y)写入filename.txt文件 -5. 画y(x)图,并保存到filename.jpg文件。具体画图格式可在函数中修改! -6. 画3d_surface图,并保存到filename.jpg文件。具体画图格式可在函数中修改! -7. 画contour图,并保存到filename.jpg文件。具体画图格式可在函数中修改! -8. 画2d_scatter图,并保存到filename.jpg文件。具体画图格式可在函数中修改! -9. 画3d_scatter图,并保存到filename.jpg文件。具体画图格式可在函数中修改! -10. 制作动画 -11. 在参数x下,计算matrix函数的本征值eigenvalue_array[:, index] -12. 在参数(x,y)下,计算matrix函数的本征值eigenvalue_array[:, :, index] -""" - - -import numpy as np -# import os -# os.chdir('D:/data') - - -def main(): - pass # 读取数据 + 数据处理 + 保存新数据 - - -# 1. 读取filename.txt文件中的一维数据y(x) -def read_one_dimensional_data(filename='a'): - f = open(filename+'.txt', 'r') - text = f.read() - f.close() - row_list = np.array(text.split('\n')) - dim_column = np.array(row_list[0].split()).shape[0] - x = np.array([]) - y = np.array([]) - for row in row_list: - column = np.array(row.split()) - if column.shape[0] != 0: - x = np.append(x, [float(column[0])], axis=0) - y_row = np.zeros(dim_column-1) - for dim0 in range(dim_column-1): - y_row[dim0] = float(column[dim0+1]) - if np.array(y).shape[0] == 0: - y = [y_row] - else: - y = np.append(y, [y_row], axis=0) - return x, y - - -# 2. 读取filename.txt文件中的二维数据matrix(x,y) -def read_two_dimensional_data(filename='a'): - f = open(filename+'.txt', 'r') - text = f.read() - f.close() - row_list = np.array(text.split('\n')) - dim_column = np.array(row_list[0].split()).shape[0] - x = np.array([]) - y = np.array([]) - matrix = np.array([]) - for i0 in range(row_list.shape[0]): - column = np.array(row_list[i0].split()) - if i0 == 0: - x_str = column[1::] - x = np.zeros(x_str.shape[0]) - for i00 in range(x_str.shape[0]): - x[i00] = float(x_str[i00]) - elif column.shape[0] != 0: - y = np.append(y, [float(column[0])], axis=0) - matrix_row = np.zeros(dim_column-1) - for dim0 in range(dim_column-1): - matrix_row[dim0] = float(column[dim0+1]) - if np.array(matrix).shape[0] == 0: - matrix = [matrix_row] - else: - matrix = np.append(matrix, [matrix_row], axis=0) - return x, y, matrix - - -# 3. 把一维数据y(x)写入filename.txt文件 -def write_one_dimensional_data(x, y, filename='a'): - with open(filename+'.txt', 'w') as f: - i0 = 0 - for x0 in x: - f.write(str(x0)+' ') - if len(y.shape) == 1: - f.write(str(y[i0])+'\n') - elif len(y.shape) == 2: - for j0 in range(y.shape[1]): - f.write(str(y[i0, j0])+' ') - f.write('\n') - i0 += 1 - - -# 4. 把二维数据matrix(x,y)写入filename.txt文件 -def write_two_dimensional_data(x, y, matrix, filename='a'): - with open(filename+'.txt', 'w') as f: - f.write('0 ') - for x0 in x: - f.write(str(x0)+' ') - f.write('\n') - i0 = 0 - for y0 in y: - f.write(str(y0)) - j0 = 0 - for x0 in x: - f.write(' '+str(matrix[i0, j0])+' ') - j0 += 1 - f.write('\n') - i0 += 1 - - -# 5. 画y(x)图,并保存到filename.jpg文件。具体画图格式可在函数中修改。 -def plot(x, y, xlabel='x', ylabel='y', title='', filename='a', show=1, save=0): - import matplotlib.pyplot as plt - fig, ax = plt.subplots() - plt.subplots_adjust(bottom=0.20, left=0.18) - ax.plot(x, y, '-o') - ax.grid() - ax.set_title(title, fontsize=20, fontfamily='Times New Roman') - ax.set_xlabel(xlabel, fontsize=20, fontfamily='Times New Roman') - ax.set_ylabel(ylabel, fontsize=20, fontfamily='Times New Roman') - ax.tick_params(labelsize=20) - labels = ax.get_xticklabels() + ax.get_yticklabels() - [label.set_fontname('Times New Roman') for label in labels] - if save == 1: - plt.savefig(filename+'.jpg', dpi=300) - if show == 1: - plt.show() - plt.close('all') - - - -# 6. 画3d_surface图,并保存到filename.jpg文件。具体画图格式可在函数中修改。 -def plot_3d_surface(x, y, matrix, xlabel='x', ylabel='y', zlabel='z', title='', filename='a', show=1, save=0): - import matplotlib.pyplot as plt - from matplotlib import cm - from matplotlib.ticker import LinearLocator - fig, ax = plt.subplots(subplot_kw={"projection": "3d"}) - plt.subplots_adjust(bottom=0.1, right=0.65) - x, y = np.meshgrid(x, y) - if len(matrix.shape) == 2: - surf = ax.plot_surface(x, y, matrix, cmap=cm.coolwarm, linewidth=0, antialiased=False) - elif len(matrix.shape) == 3: - for i0 in range(matrix.shape[2]): - surf = ax.plot_surface(x, y, matrix[:,:,i0], cmap=cm.coolwarm, linewidth=0, antialiased=False) - ax.set_title(title, fontsize=20, fontfamily='Times New Roman') - ax.set_xlabel(xlabel, fontsize=20, fontfamily='Times New Roman') - ax.set_ylabel(ylabel, fontsize=20, fontfamily='Times New Roman') - ax.set_zlabel(zlabel, fontsize=20, fontfamily='Times New Roman') - ax.zaxis.set_major_locator(LinearLocator(5)) - ax.zaxis.set_major_formatter('{x:.2f}') - ax.tick_params(labelsize=15) - labels = ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels() - [label.set_fontname('Times New Roman') for label in labels] - cax = plt.axes([0.80, 0.15, 0.05, 0.75]) - cbar = fig.colorbar(surf, cax=cax) - cbar.ax.tick_params(labelsize=15) - for l in cbar.ax.yaxis.get_ticklabels(): - l.set_family('Times New Roman') - if save == 1: - plt.savefig(filename+'.jpg', dpi=300) - if show == 1: - plt.show() - plt.close('all') - - - -# 7. 画plot_contour图,并保存到filename.jpg文件。具体画图格式可在函数中修改。 -def plot_contour(x, y, matrix, xlabel='x', ylabel='y', title='', filename='a', show=1, save=0): - import matplotlib.pyplot as plt - from matplotlib import cm - from matplotlib.ticker import LinearLocator - fig, ax = plt.subplots() - plt.subplots_adjust(bottom=0.2, right=0.75, left = 0.16) - x, y = np.meshgrid(x, y) - contour = ax.contourf(x,y,matrix,cmap='jet') - ax.set_title(title, fontsize=20, fontfamily='Times New Roman') - ax.set_xlabel(xlabel, fontsize=20, fontfamily='Times New Roman') - ax.set_ylabel(ylabel, fontsize=20, fontfamily='Times New Roman') - ax.tick_params(labelsize=15) - labels = ax.get_xticklabels() + ax.get_yticklabels() - [label.set_fontname('Times New Roman') for label in labels] - cax = plt.axes([0.78, 0.17, 0.08, 0.71]) - cbar = fig.colorbar(contour, cax=cax) - cbar.ax.tick_params(labelsize=15) - for l in cbar.ax.yaxis.get_ticklabels(): - l.set_family('Times New Roman') - if save == 1: - plt.savefig(filename+'.jpg', dpi=300) - if show == 1: - plt.show() - plt.close('all') - - -# 8. 画2d_scatter图,并保存到filename.jpg文件。具体画图格式可在函数中修改! -def plot_2d_scatter(x, y, value, xlabel='x', ylabel='y', title='', filename='a', show=1, save=0): - import matplotlib.pyplot as plt - from matplotlib.axes._axes import _log as matplotlib_axes_logger - matplotlib_axes_logger.setLevel('ERROR') - fig = plt.figure() - ax = fig.add_subplot(111) - plt.subplots_adjust(bottom=0.2, right=0.8, left=0.2) - for i in range(np.array(x).shape[0]): - ax.scatter(x[i], y[i], marker='o', s=100*value[i], c=(1,0,0)) - ax.set_title(title, fontsize=20, fontfamily='Times New Roman') - ax.set_xlabel(xlabel, fontsize=20, fontfamily='Times New Roman') - ax.set_ylabel(ylabel, fontsize=20, fontfamily='Times New Roman') - ax.tick_params(labelsize=15) - labels = ax.get_xticklabels() + ax.get_yticklabels() - [label.set_fontname('Times New Roman') for label in labels] - if save == 1: - plt.savefig(filename+'.jpg', dpi=300) - if show == 1: - plt.show() - plt.close('all') - - -# 9. 画3d_scatter图,并保存到filename.jpg文件。具体画图格式可在函数中修改! -def plot_3d_scatter(x, y, z, value, xlabel='x', ylabel='y', zlabel='z', title='', filename='a', show=1, save=0): - import matplotlib.pyplot as plt - from matplotlib.ticker import LinearLocator - from matplotlib.axes._axes import _log as matplotlib_axes_logger - matplotlib_axes_logger.setLevel('ERROR') - fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - plt.subplots_adjust(bottom=0.1, right=0.8) - for i in range(np.array(x).shape[0]): - ax.scatter(x[i], y[i], z[i], marker='o', s=int(100*value[i]), c=(1,0,0)) - ax.set_title(title, fontsize=20, fontfamily='Times New Roman') - ax.set_xlabel(xlabel, fontsize=20, fontfamily='Times New Roman') - ax.set_ylabel(ylabel, fontsize=20, fontfamily='Times New Roman') - ax.set_zlabel(zlabel, fontsize=20, fontfamily='Times New Roman') - ax.tick_params(labelsize=15) - labels = ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels() - [label.set_fontname('Times New Roman') for label in labels] - if save == 1: - plt.savefig(filename+'.jpg', dpi=300) - if show == 1: - plt.show() - plt.close('all') - - -# 10. 制作动画 -def creat_animation(image_names, duration_time=0.5, filename='a'): - import imageio - images = [] - for name in image_names: - image = name+'.jpg' - im = imageio.imread(image) - images.append(im) - imageio.mimsave(filename+'.gif', images, 'GIF', duration=duration_time) # durantion是延迟时间 - - -# 11. 在参数x下,计算matrix函数的本征值eigenvalue_array[:, index] -def calculate_eigenvalue_with_one_parameter(x, matrix): - dim_x = np.array(x).shape[0] - i0 = 0 - if np.array(matrix(0)).shape==(): - eigenvalue_array = np.zeros((dim_x, 1)) - for x0 in x: - matrix0 = matrix(x0) - eigenvalue_array[i0, 0] = np.real(matrix0) - i0 += 1 - else: - dim = np.array(matrix(0)).shape[0] - eigenvalue_array = np.zeros((dim_x, dim)) - for x0 in x: - matrix0 = matrix(x0) - eigenvalue, eigenvector = np.linalg.eig(matrix0) - eigenvalue_array[i0, :] = np.sort(np.real(eigenvalue[:])) - i0 += 1 - return eigenvalue_array - - -# 12. 在参数(x,y)下,计算matrix函数的本征值eigenvalue_array[:, :, index] -def calculate_eigenvalue_with_two_parameters(x, y, matrix): - dim_x = np.array(x).shape[0] - dim_y = np.array(y).shape[0] - if np.array(matrix(0,0)).shape==(): - eigenvalue_array = np.zeros((dim_y, dim_x, 1)) - i0 = 0 - for y0 in y: - j0 = 0 - for x0 in x: - matrix0 = matrix(x0, y0) - eigenvalue_array[i0, j0, 0] = np.real(matrix0) - j0 += 1 - i0 += 1 - else: - dim = np.array(matrix(0, 0)).shape[0] - eigenvalue_array = np.zeros((dim_y, dim_x, dim)) - i0 = 0 - for y0 in y: - j0 = 0 - for x0 in x: - matrix0 = matrix(x0, y0) - eigenvalue, eigenvector = np.linalg.eig(matrix0) - eigenvalue_array[i0, j0, :] = np.sort(np.real(eigenvalue[:])) - j0 += 1 - i0 += 1 - return eigenvalue_array - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/2021.01.23_find_key_words_in_pdf_files_with_pdfminer/find_key_words_in_pdf_files.py b/2021.01.23_find_key_words_in_pdf_files_with_pdfminer/find_key_words_in_pdf_files.py deleted file mode 100644 index affae8f..0000000 --- a/2021.01.23_find_key_words_in_pdf_files_with_pdfminer/find_key_words_in_pdf_files.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -This code is supported by the website: https://www.guanjihuan.com -The newest version of this code is on the web page: https://www.guanjihuan.com/archives/9129 -""" - -import os -import re -import time -import logging -logging.Logger.propagate = False -logging.getLogger().setLevel(logging.ERROR) # 只显示error级别的通知 - - -def main(): - # 参数 - key_word_array = ['photonic', 'Berry phase'] - original_path = 'D:\\文献' - - # 查找所有的PDF文件路径 - pdf_file_all = find_files_pdf(original_path) - print('\n该文件夹下总共有', len(pdf_file_all), '个PDF文件。\n') - - f = open('error.txt','w',encoding='utf-8') - f.close() - for key_word in key_word_array: - f = open(str(key_word)+'.txt','w',encoding='utf-8') - f.write('该文件夹下总共有'+str(len(pdf_file_all))+'个PDF文件。\n') - f.close() - - # 查找包含关键词的PDF文件 - i0 = 1 - begin = time.time() - for pdf_file in pdf_file_all: - print('查找第', i0, '个文件,', end='') - begin0 = time.time() - try: - content = get_text_from_pdf(pdf_file) - for key_word in key_word_array: - if re.search(re.compile(key_word),content): - print('发现文件!关键词', key_word, '对应的文件位置在:\n\n', pdf_file, '\n') - with open(str(key_word)+'.txt','a',encoding='utf-8') as f: - f.write('\n查找第'+str(i0)+'个文件时发现文件!位置在:\n'+pdf_file+'\n') - except: - print('出现异常!位置在:\n\n', pdf_file, '\n') - with open('error.txt','a',encoding='utf-8') as f: - f.write('\n解析第'+str(i0)+'个文件时出现异常!位置在:\n'+pdf_file+'\n') - end0 = time.time() - print('用时', end0-begin0, '秒') - i0 += 1 - print('\n全部搜索结束!') - end = time.time() - print('\n总共用时:', (end-begin)/60, '分') - - -def find_files_pdf(path): # 查找所有PDF文件 - file_all = find_files(path) - pdf_file_all = [] - for file0 in file_all: - if re.search(re.compile('^fdp.'),file0[::-1]): # 如果文件是以.pdf结尾 - pdf_file_all.append(file0) - return pdf_file_all - - -def find_files(path): # 查找所有文件 - file_all = [] - path_next_loop = [path] - for i in range(10000): # i为文件在文件夹中的深度 - file_all_in_one_loop, path_next_loop = find_files_loop_module(path_next_loop) - for file_in_one_loop in file_all_in_one_loop: - file_all.append(file_in_one_loop) - if path_next_loop == []: - break - return file_all - - -def find_files_loop_module(path_all): # 查找文件的一个循环模块 - file_all_in_one_loop = [] - path_next_loop = [] - for path in path_all: - filenames = os.listdir(path) - for filename in filenames: - filename = os.path.join(path,filename) - if os.path.isfile(filename): # 如果是文件 - file_all_in_one_loop.append(filename) - else: # 如果是文件夹 - path_next_loop.append(filename) - return file_all_in_one_loop, path_next_loop - - -def get_text_from_pdf(file_path): # 从PDF中获取文本 - from pdfminer.pdfparser import PDFParser, PDFDocument - from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter - from pdfminer.converter import PDFPageAggregator - from pdfminer.layout import LAParams, LTTextBox - from pdfminer.pdfinterp import PDFTextExtractionNotAllowed - - # 用文件对象来创建一个pdf文档分析器 - praser = PDFParser(open(file_path, 'rb')) - # 创建一个PDF文档 - doc = PDFDocument() - # 连接分析器 与文档对象 - praser.set_document(doc) - doc.set_parser(praser) - - # 提供初始化密码 - # 如果没有密码 就创建一个空的字符串 - doc.initialize() - - # 检测文档是否提供txt转换,不提供就忽略 - if not doc.is_extractable: - raise PDFTextExtractionNotAllowed - else: - # 创建PDf 资源管理器 来管理共享资源 - rsrcmgr = PDFResourceManager() - # 创建一个PDF设备对象 - laparams = LAParams() - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - # 创建一个PDF解释器对象 - interpreter = PDFPageInterpreter(rsrcmgr, device) - - # 循环遍历列表,每次处理一个page的内容 - content = '' - for page in doc.get_pages(): - interpreter.process_page(page) - # 接受该页面的LTPage对象 - layout = device.get_result() - # 这里layout是一个LTPage对象,里面存放着这个 page 解析出的各种对象 - # 包括 LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal 等 - for x in layout: - if isinstance(x, LTTextBox): - # print(x.get_text().strip()) - content = content + x.get_text().strip() - return content - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/2021.01.23_find_key_words_in_pdf_files_with_pdfminer/get_content_in_a_pdf_file.py b/2021.01.23_find_key_words_in_pdf_files_with_pdfminer/get_content_in_a_pdf_file.py deleted file mode 100644 index 5ff8ad4..0000000 --- a/2021.01.23_find_key_words_in_pdf_files_with_pdfminer/get_content_in_a_pdf_file.py +++ /dev/null @@ -1,63 +0,0 @@ -import os -os.chdir('D:/') # PDF文件存放的位置 -import logging -logging.Logger.propagate = False -logging.getLogger().setLevel(logging.ERROR) # 只显示error级别的通知 - - -def main(): - content = get_text_from_pdf('a') - with open('a.txt', 'w', encoding='utf-8') as f: - f.write(content) - - -def get_text_from_pdf(filename): - from pdfminer.pdfparser import PDFParser, PDFDocument - from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter - from pdfminer.converter import PDFPageAggregator - from pdfminer.layout import LAParams, LTTextBox - from pdfminer.pdfinterp import PDFTextExtractionNotAllowed - - path = filename+".pdf" - - # 用文件对象来创建一个pdf文档分析器 - praser = PDFParser(open(path, 'rb')) - # 创建一个PDF文档 - doc = PDFDocument() - # 连接分析器 与文档对象 - praser.set_document(doc) - doc.set_parser(praser) - - # 提供初始化密码 - # 如果没有密码 就创建一个空的字符串 - doc.initialize() - - # 检测文档是否提供txt转换,不提供就忽略 - if not doc.is_extractable: - raise PDFTextExtractionNotAllowed - else: - # 创建PDf 资源管理器 来管理共享资源 - rsrcmgr = PDFResourceManager() - # 创建一个PDF设备对象 - laparams = LAParams() - device = PDFPageAggregator(rsrcmgr, laparams=laparams) - # 创建一个PDF解释器对象 - interpreter = PDFPageInterpreter(rsrcmgr, device) - - # 循环遍历列表,每次处理一个page的内容 - content = '' - for page in doc.get_pages(): - interpreter.process_page(page) - # 接受该页面的LTPage对象 - layout = device.get_result() - # 这里layout是一个LTPage对象,里面存放着这个 page 解析出的各种对象 - # 包括 LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal 等 - for x in layout: - if isinstance(x, LTTextBox): - # print(x.get_text().strip()) - content = content + x.get_text().strip() - return content - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/2022.02.16_change_directory_by_replacement/change_directory_by_replacement _with_guan.py b/2022.02.16_change_directory_by_replacement/change_directory_by_replacement _with_guan.py deleted file mode 100644 index b67002d..0000000 --- a/2022.02.16_change_directory_by_replacement/change_directory_by_replacement _with_guan.py +++ /dev/null @@ -1,4 +0,0 @@ -import guan -guan.change_directory_by_replacement(current_key_word='code', new_key_word='data') -with open('data.txt', 'w') as f: # 保存数据 - f.write('Hello world') \ No newline at end of file diff --git a/2022.02.16_change_directory_by_replacement/change_directory_by_replacement _with_guan_2.py b/2022.02.16_change_directory_by_replacement/change_directory_by_replacement _with_guan_2.py deleted file mode 100644 index 86c4547..0000000 --- a/2022.02.16_change_directory_by_replacement/change_directory_by_replacement _with_guan_2.py +++ /dev/null @@ -1,4 +0,0 @@ -import guan -guan.change_directory_by_replacement(current_key_word='working/code', new_key_word='local/data') -with open('data.txt', 'w') as f: # 保存数据 - f.write('Hello world') \ No newline at end of file diff --git a/2022.02.16_change_directory_by_replacement/change_directory_by_replacement.py b/2022.02.16_change_directory_by_replacement/change_directory_by_replacement.py deleted file mode 100644 index b2052a2..0000000 --- a/2022.02.16_change_directory_by_replacement/change_directory_by_replacement.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -code_path = os.getcwd() # 当前代码文件的路径 -data_path = code_path.replace('\\', '/') # \改为/,防止路径报错 -data_path = data_path.replace('code', 'data') # 把路径中code改为data -if os.path.exists(data_path) == False: # 如果文件夹不存在,新建文件夹 - os.makedirs(data_path) -os.chdir(data_path) # 转到数据的存放路径 -with open('data.txt', 'w') as f: # 保存数据 - f.write('Hello world') \ No newline at end of file diff --git a/2022.08.31_batch_modify_file_name/batch_modify_file_name.py b/2022.08.31_batch_modify_file_name/batch_modify_file_name.py deleted file mode 100644 index 01656ac..0000000 --- a/2022.08.31_batch_modify_file_name/batch_modify_file_name.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -This code is supported by the website: https://www.guanjihuan.com -The newest version of this code is on the web page: https://www.guanjihuan.com/archives/25453 -""" - -import os - -# 选取某个目录 -directory = 'E:/' - -def main(): - for root, dirs, files in os.walk(directory): - for i0 in range(len(files)): - if 'pdf' in files[i0] or 'djvu' in files[i0]: # 满足某个条件的文件 - - # 显示旧文件名 - name = files[i0] - print(name) # 显示旧文件名 - - # 显示新文件名 - new_name = modify_name(name) - print(new_name) - print() - - # # 修改文件名。注意:需要检查前面的代码,尤其是modify_name的规则看是否都满足,再运行下面的代码,否则文件名的修改会出现遗漏或混乱。 - # if new_name != None: - # os.rename(root+'/'+name, root+'/'+new_name) - - -def modify_name(name): # 按某种规则修改文件名 - array = name.split(' - ') # 通过' - '把这类型的文件名切开 - if len(array) != 3: - print('Miss:', name) - new_name = None # 如果不满足规则,则不修改 - else: - new_name= array[1]+' - '+array[0]+' - '+array[2] # 做个对调 - return new_name - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/2022.09.07_move_all_files_to_root_directory/move_all_files_to_root_directory.py b/2022.09.07_move_all_files_to_root_directory/move_all_files_to_root_directory.py deleted file mode 100644 index 04dab56..0000000 --- a/2022.09.07_move_all_files_to_root_directory/move_all_files_to_root_directory.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -This code is supported by the website: https://www.guanjihuan.com -The newest version of this code is on the web page: https://www.guanjihuan.com/archives/25685 -""" - -# 注意:这个程序请小心使用,防止误操作把系统文件或个人文件破坏。在选取好directory目录后,请经过再三确认无误后再运行,尤其是directory的层级不能太高。 - - -def main(): - # 选取某个目录 - directory = 'E:/test/all_files' - move_all_files_to_root_directory(directory) - - # import guan - # guan.move_all_files_to_root_directory(directory) - - -def move_all_files_to_root_directory(directory): - import os - import shutil - for root, dirs, files in os.walk(directory): - for i0 in range(len(files)): - # print(root) # 文件对应目录 - # print(files[i0], '\n') # 文件 - shutil.move(root+'/'+files[i0], directory+'/'+files[i0]) # 移动所有文件至根目录 - for i0 in range(100): # 多次尝试删除层数比较多的空文件夹,例如100层 - for root, dirs, files in os.walk(directory): - try: - os.rmdir(root) # 删除空文件夹 - except: - pass - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/2022.09.08_get_file_list_and_write_in_markdown/get_file_list_and_write_in_markdown.py b/2022.09.08_get_file_list_and_write_in_markdown/get_file_list_and_write_in_markdown.py deleted file mode 100644 index d1e318a..0000000 --- a/2022.09.08_get_file_list_and_write_in_markdown/get_file_list_and_write_in_markdown.py +++ /dev/null @@ -1,116 +0,0 @@ -""" -This code is supported by the website: https://www.guanjihuan.com -The newest version of this code is on the web page: https://www.guanjihuan.com/archives/25699 -""" - - -def main(): - directory = 'E:/literature' - write_file_list_in_markdown(directory) - - -def write_file_list_in_markdown(directory='./', filename='a', reverse_positive_or_negative=1, starting_from_h1=None, banned_file_format=[], hide_file_format=None, divided_line=None, show_second_number=None, show_third_number=None): - import os - f = open(filename+'.md', 'w', encoding="utf-8") - filenames1 = os.listdir(directory) - u0 = 0 - for filename1 in filenames1[::reverse_positive_or_negative]: - filename1_with_path = os.path.join(directory,filename1) - if os.path.isfile(filename1_with_path): # 文件 - if os.path.splitext(filename1)[1] not in banned_file_format: - if hide_file_format == None: - f.write('+ '+str(filename1)+'\n\n') - else: - f.write('+ '+str(os.path.splitext(filename1)[0])+'\n\n') - else: # 文件夹 - u0 += 1 - if divided_line != None and u0 != 1: - f.write('--------\n\n') - if starting_from_h1 == None: - f.write('#') - f.write('# '+str(filename1)+'\n\n') - - filenames2 = os.listdir(filename1_with_path) - i0 = 0 - for filename2 in filenames2[::reverse_positive_or_negative]: - filename2_with_path = os.path.join(directory, filename1, filename2) - if os.path.isfile(filename2_with_path): # 文件 - if os.path.splitext(filename2)[1] not in banned_file_format: - if hide_file_format == None: - f.write('+ '+str(filename2)+'\n\n') - else: - f.write('+ '+str(os.path.splitext(filename2)[0])+'\n\n') - else: # 文件夹 - i0 += 1 - if starting_from_h1 == None: - f.write('#') - if show_second_number != None: - f.write('## '+str(i0)+'. '+str(filename2)+'\n\n') - else: - f.write('## '+str(filename2)+'\n\n') - - j0 = 0 - filenames3 = os.listdir(filename2_with_path) - for filename3 in filenames3[::reverse_positive_or_negative]: - filename3_with_path = os.path.join(directory, filename1, filename2, filename3) - if os.path.isfile(filename3_with_path): # 文件 - if os.path.splitext(filename3)[1] not in banned_file_format: - if hide_file_format == None: - f.write('+ '+str(filename3)+'\n\n') - else: - f.write('+ '+str(os.path.splitext(filename3)[0])+'\n\n') - else: # 文件夹 - j0 += 1 - if starting_from_h1 == None: - f.write('#') - if show_third_number != None: - f.write('### ('+str(j0)+') '+str(filename3)+'\n\n') - else: - f.write('### '+str(filename3)+'\n\n') - - filenames4 = os.listdir(filename3_with_path) - for filename4 in filenames4[::reverse_positive_or_negative]: - filename4_with_path = os.path.join(directory, filename1, filename2, filename3, filename4) - if os.path.isfile(filename4_with_path): # 文件 - if os.path.splitext(filename4)[1] not in banned_file_format: - if hide_file_format == None: - f.write('+ '+str(filename4)+'\n\n') - else: - f.write('+ '+str(os.path.splitext(filename4)[0])+'\n\n') - else: # 文件夹 - if starting_from_h1 == None: - f.write('#') - f.write('#### '+str(filename4)+'\n\n') - - filenames5 = os.listdir(filename4_with_path) - for filename5 in filenames5[::reverse_positive_or_negative]: - filename5_with_path = os.path.join(directory, filename1, filename2, filename3, filename4, filename5) - if os.path.isfile(filename5_with_path): # 文件 - if os.path.splitext(filename5)[1] not in banned_file_format: - if hide_file_format == None: - f.write('+ '+str(filename5)+'\n\n') - else: - f.write('+ '+str(os.path.splitext(filename5)[0])+'\n\n') - else: # 文件夹 - if starting_from_h1 == None: - f.write('#') - f.write('##### '+str(filename5)+'\n\n') - - filenames6 = os.listdir(filename5_with_path) - for filename6 in filenames6[::reverse_positive_or_negative]: - filename6_with_path = os.path.join(directory, filename1, filename2, filename3, filename4, filename5, filename6) - if os.path.isfile(filename6_with_path): # 文件 - if os.path.splitext(filename6)[1] not in banned_file_format: - if hide_file_format == None: - f.write('+ '+str(filename6)+'\n\n') - else: - f.write('+ '+str(os.path.splitext(filename6)[0])+'\n\n') - else: # 文件夹 - if starting_from_h1 == None: - f.write('#') - f.write('###### '+str(filename6)+'\n\n') - f.close() - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/2022.09.12_creat_necessary_file_or_delete_file_with_specific_name/creat_necessary_file_or_delete_file_with_specific_name.py b/2022.09.12_creat_necessary_file_or_delete_file_with_specific_name/creat_necessary_file_or_delete_file_with_specific_name.py deleted file mode 100644 index 48def34..0000000 --- a/2022.09.12_creat_necessary_file_or_delete_file_with_specific_name/creat_necessary_file_or_delete_file_with_specific_name.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -This code is supported by the website: https://www.guanjihuan.com -The newest version of this code is on the web page: https://www.guanjihuan.com/archives/25943 -""" - - -def main(): - directory = 'E:/test' - creat_necessary_file(directory) - # delete_file_with_specific_name(directory) - - # import guan - # guan.creat_necessary_file(directory) - # guan.delete_file_with_specific_name(directory) - - -def creat_necessary_file(directory, filename='readme', file_format='.md', content='', overwrite=None, ignored_directory_with_words=[]): - import os - directory_with_file = [] - ignored_directory = [] - for root, dirs, files in os.walk(directory): - for i0 in range(len(files)): - if root not in directory_with_file: - directory_with_file.append(root) - if files[i0] == filename+file_format: - if root not in ignored_directory: - ignored_directory.append(root) - if overwrite == None: - for root in ignored_directory: - directory_with_file.remove(root) - ignored_directory_more =[] - for root in directory_with_file: - for word in ignored_directory_with_words: - if word in root: - if root not in ignored_directory_more: - ignored_directory_more.append(root) - for root in ignored_directory_more: - directory_with_file.remove(root) - for root in directory_with_file: - os.chdir(root) - f = open(filename+file_format, 'w', encoding="utf-8") - f.write(content) - f.close() - - -def delete_file_with_specific_name(directory, filename='readme', file_format='.md'): - import os - for root, dirs, files in os.walk(directory): - for i0 in range(len(files)): - if files[i0] == filename+file_format: - os.remove(root+'/'+files[i0]) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/2022.09.14_find_repeated_file_with_same_filename/find_repeated_file_with_same_filename.py b/2022.09.14_find_repeated_file_with_same_filename/find_repeated_file_with_same_filename.py deleted file mode 100644 index 576f0e2..0000000 --- a/2022.09.14_find_repeated_file_with_same_filename/find_repeated_file_with_same_filename.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -This code is supported by the website: https://www.guanjihuan.com -The newest version of this code is on the web page: https://www.guanjihuan.com/archives/26113 -""" - -# 仅支持文件名判断是否重复,不支持对文件内容的判断。 -# 如需对文件名和内容都判断,需要计算文件的哈希值。这里暂时不考虑。 - - -def main(): - directory = 'E:/test' - repeated_file = find_repeated_file_with_same_filename(directory) - print(repeated_file) - - # import guan - # repeated_file = guan.find_repeated_file_with_same_filename(directory='./', ignored_directory_with_words=[], ignored_file_with_words=[], num=1000) - # print(repeated_file) - - -def find_repeated_file_with_same_filename(directory='./', ignored_directory_with_words=[], ignored_file_with_words=[], num=1000): - import os - from collections import Counter - file_list = [] - for root, dirs, files in os.walk(directory): - for i0 in range(len(files)): - file_list.append(files[i0]) - for word in ignored_directory_with_words: - if word in root: - file_list.remove(files[i0]) - for word in ignored_file_with_words: - if word in files[i0]: - try: - file_list.remove(files[i0]) - except: - pass - count_file = Counter(file_list).most_common(num) - repeated_file = [] - for item in count_file: - if item[1]>1: - repeated_file.append(item) - return repeated_file - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/2022.09.30_count_file_in_sub_directory/count_file_in_sub_directory.py b/2022.09.30_count_file_in_sub_directory/count_file_in_sub_directory.py deleted file mode 100644 index 0dedb1b..0000000 --- a/2022.09.30_count_file_in_sub_directory/count_file_in_sub_directory.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -This code is supported by the website: https://www.guanjihuan.com -The newest version of this code is on the web page: https://www.guanjihuan.com/archives/26536 -""" - - -def main(): - # 如果子文件夹中所有文件的数量小于5,输出路径。 - count_file_in_sub_directory(directory='./', smaller_than_num=5) - - # import guan - # guan.count_file_in_sub_directory(directory='./', smaller_than_num=5) - - -def count_file_in_sub_directory(directory='./', smaller_than_num=None): - import os - from collections import Counter - dirs_list = [] - for root, dirs, files in os.walk(directory): - if dirs != []: - for i0 in range(len(dirs)): - dirs_list.append(root+'/'+dirs[i0]) - for sub_dir in dirs_list: - file_list = [] - for root, dirs, files in os.walk(sub_dir): - for i0 in range(len(files)): - file_list.append(files[i0]) - count_file = len(file_list) - if smaller_than_num == None: - print(sub_dir) - print(count_file) - print() - else: - if count_file