py.guanjihuan.com/PyPI/src/guan/data_processing.py

127 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Module: data_processing
# Preprocessing for parallel calculations: split the parameters into one chunk per task
def preprocess_for_parallel_calculations(parameter_array_all, cpus=1, task_index=0):
    """Return the contiguous chunk of ``parameter_array_all`` assigned to one task.

    The parameters are divided into ``cpus`` contiguous chunks whose sizes
    differ by at most one element; when the total is not divisible by
    ``cpus``, the first ``len(parameter_array_all) % cpus`` chunks each take
    one extra element. Concatenating the chunks for ``task_index`` 0 through
    ``cpus - 1`` reproduces the input exactly once, in order.

    Args:
        parameter_array_all: Sized, sliceable sequence of parameters
            (for example a list, tuple or NumPy array).
        cpus: Number of parallel tasks. Must be at least 1.
        task_index: Zero-based index of the task whose chunk is returned.

    Returns:
        ``parameter_array_all[start:stop]`` for this task. The chunk is empty
        when there are fewer parameters than tasks and this task gets none.

    Raises:
        ValueError: If ``cpus`` is smaller than 1.
    """
    if cpus < 1:
        raise ValueError("cpus must be a positive integer, got %r" % (cpus,))
    num_all = len(parameter_array_all)
    chunk_size, remainder = divmod(num_all, cpus)
    # Tasks with an index below `remainder` hold one extra element, so every
    # later task's start is shifted by the number of such tasks before it.
    start = task_index * chunk_size + min(task_index, remainder)
    stop = start + chunk_size + (1 if task_index < remainder else 0)
    return parameter_array_all[start:stop]
# Sort the sub-arrays by their element at position `index` (index starts at 0)
def sort_array_by_index_element(original_array, index):
    """Return a new list of the items of ``original_array`` sorted by ``item[index]``.

    The input is not modified. Items that compare equal on the key keep
    their original relative order.
    """
    def key_of(item):
        return item[index]

    ordered = list(original_array)
    ordered.sort(key=key_of)
    return ordered
# Get a random integer; both ends of the interval are inclusive
def get_random_number(start=0, end=1):
    """Return a random integer drawn from the closed interval [start, end]."""
    from random import randint
    return randint(start, end)  # closed on both sides: [start, end]
# Generate a reproducible random integer from a given seed; left-closed, right-open
def generate_random_int_number_for_a_specific_seed(seed=0, x_min=0, x_max=10):
    """Seed NumPy's global random state with ``seed`` and draw one integer.

    The value is drawn from the half-open interval [x_min, x_max). Note that
    the call reseeds ``numpy.random``'s global state as a side effect.
    """
    import numpy
    global_rng = numpy.random
    global_rng.seed(seed)
    return global_rng.randint(x_min, x_max)  # half-open: [x_min, x_max)
# Print the items of an array, optionally prefixed with a running index
def print_array_with_index(array, show_index=1, index_type=0):
    """Print each item of ``array`` on its own line.

    Args:
        array: Iterable of items to print.
        show_index: When equal to 0, only the items are printed; otherwise
            each line is ``<index> <item>``.
        index_type: When equal to 0 the numbering starts at 0; any other
            value makes it start at 1.
    """
    if show_index == 0:
        for item in array:
            print(item)
        return
    first_number = 0 if index_type == 0 else 1
    for number, item in enumerate(array, start=first_number):
        print(number, item)
# Segment text into words using the jieba package
def divide_text_into_words(text):
    """Return the list of tokens produced by ``jieba.lcut(text)``."""
    from jieba import lcut
    return lcut(text)
# Split text into pieces of at most a given number of characters
def split_text(text, wrap_width=3000):
    """Wrap ``text`` with the standard ``textwrap`` rules and return the lines.

    Each returned string is at most ``wrap_width`` characters long; an empty
    input yields an empty list.
    """
    from textwrap import TextWrapper
    wrapper = TextWrapper(width=wrap_width)
    return wrapper.wrap(text)
# Decide whether a character is Chinese, English, or something else
def check_Chinese_or_English(a):
    """Classify ``a`` by string comparison against two code-point ranges.

    Returns:
        ``'Chinese'`` if ``a`` lies between U+4E00 and U+9FFF,
        ``'English'`` if it lies between U+0000 and U+00FF,
        ``'Others'`` otherwise. Intended for a single character.
    """
    if '\u4e00' <= a <= '\u9fff':
        return 'Chinese'
    if '\x00' <= a <= '\xff':
        return 'English'
    return 'Others'
# Count the words in mixed Chinese/English text; spaces are excluded by default
def count_words(text, include_space=0, show_words=0):
    """Count the words in ``text`` after tokenizing it with ``jieba.lcut``.

    Each token is classified by its first character using
    ``guan.check_Chinese_or_English``. A token classified as Chinese
    contributes one count per character; any other token contributes one
    count as a whole.

    Args:
        text: The text to analyse.
        include_space: When equal to 0, tokens that are exactly a single
            space ``' '`` are dropped before counting; otherwise every token
            is counted.
        show_words: When equal to 1, the list of counted units is printed.

    Returns:
        The total number of counted units.
    """
    import jieba
    import guan
    words = jieba.lcut(text)
    if include_space == 0:
        words = [word for word in words if word != ' ']
    num_words = 0
    counted_units = []
    for word in words:
        word_type = guan.check_Chinese_or_English(word[0])
        if word_type == 'Chinese':
            # Chinese text is counted per character, not per segmented token.
            num_words += len(word)
            counted_units.extend(word)
        else:
            # Covers both 'English' and 'Others'. The previous condition
            # `word_type == 'English' or 'Others'` was always truthy, so a
            # plain else states the same behaviour honestly.
            num_words += 1
            counted_units.append(word)
    if show_words == 1:
        print(counted_units)
    return num_words
# Convert RGB to HEX
def rgb_to_hex(rgb, pound=1):
    """Format three integer colour components as a lowercase hex string.

    Args:
        rgb: Sequence of exactly three integers (tuple, list, array, ...).
            Each component is written with at least two hex digits.
        pound: When equal to 0 the result has no leading ``'#'``; otherwise
            the result is prefixed with ``'#'``.

    Returns:
        A string such as ``'#ff0080'`` or ``'ff0080'``.

    Raises:
        TypeError: If ``rgb`` does not contain exactly three integer values.
    """
    # The % operator only unpacks tuples, so coerce other sequences first;
    # a list used to fail with "not enough arguments for format string".
    hex_digits = '%02x%02x%02x' % tuple(rgb)
    if pound == 0:
        return hex_digits
    return '#' + hex_digits
# Convert HEX to RGB
def hex_to_rgb(hex):
    """Parse a hex colour string into a tuple of integer components.

    Leading ``'#'`` characters are ignored. Three- and four-digit shorthand
    (``'#fff'``, ``'#0f8c'``) is expanded by doubling every digit, so
    ``'#fff'`` gives ``(255, 255, 255)``. The digits are then cut into
    groups of ``len(digits) // 3`` characters and each group is parsed as a
    base-16 integer; a six-digit colour therefore yields three components.

    Args:
        hex: Colour string, with or without a leading ``'#'``.

    Returns:
        Tuple of integers, one per digit group.

    Raises:
        ValueError: If fewer than three hex digits are given, or a group is
            not valid hexadecimal.
    """
    digits = hex.lstrip('#')
    if len(digits) < 3:
        raise ValueError("hex colour needs at least 3 hex digits, got %r" % (hex,))
    if len(digits) in (3, 4):
        # Shorthand notation: every digit stands for a doubled pair.
        digits = ''.join(digit * 2 for digit in digits)
    width = len(digits) // 3
    return tuple(int(digits[i:i + width], 16) for i in range(0, len(digits), width))
# Hash a string with MD5
def encryption_MD5(password, salt=''):
    """Return the hexadecimal MD5 digest of ``salt + password``.

    The concatenated string is encoded with the default ``str.encode()``
    encoding before hashing.

    NOTE(review): this is a one-way hash, not encryption, and MD5 is
    generally considered unsuitable for storing passwords.
    """
    from hashlib import md5
    hasher = md5()
    hasher.update((salt + password).encode())
    return hasher.hexdigest()
# Hash a string with SHA-256
def encryption_SHA_256(password, salt=''):
    """Return the hexadecimal SHA-256 digest of ``salt + password``.

    The concatenated string is encoded with the default ``str.encode()``
    encoding before hashing. This is a one-way hash, not encryption.
    """
    from hashlib import sha256
    hasher = sha256()
    hasher.update((salt + password).encode())
    return hasher.hexdigest()