127 lines
4.0 KiB
Python
127 lines
4.0 KiB
Python
# Module: data_processing
|
||
|
||
# 并行计算前的预处理,把参数分成多份
|
||
def preprocess_for_parallel_calculations(parameter_array_all, cpus=1, task_index=0):
    """Return the portion of ``parameter_array_all`` assigned to one task.

    The full parameter sequence is cut into consecutive slices so that
    ``cpus`` tasks can each process their own part.

    Args:
        parameter_array_all: Sliceable sequence holding every parameter.
        cpus: Number of tasks the parameters are divided among.
        task_index: Zero-based index of the task whose slice is wanted.

    Returns:
        The slice of ``parameter_array_all`` belonging to ``task_index``.
    """
    import numpy as np

    total = np.array(parameter_array_all).shape[0]
    splits_evenly = total % cpus == 0
    # Even split: every task gets total/cpus items.  Otherwise the first
    # cpus-1 tasks get total/(cpus-1) items each (truncated) and the last
    # task receives whatever is left over.
    if splits_evenly:
        chunk = int(total / cpus)
    else:
        chunk = int(total / (cpus - 1))
    begin = task_index * chunk
    takes_remainder = (not splits_evenly) and task_index == cpus - 1
    stop = total if takes_remainder else begin + chunk
    return parameter_array_all[begin:stop]
|
||
|
||
# 根据子数组的第index个元素对子数组进行排序(index从0开始)
|
||
def sort_array_by_index_element(original_array, index):
    """Return a new list of the sub-arrays ordered by their ``index``-th element.

    Args:
        original_array: Iterable of indexable items (e.g. lists or tuples).
        index: Zero-based position of the element used as the sort key.

    Returns:
        A new list sorted in ascending order of ``item[index]``; the input
        is not modified.
    """
    def pick_key(item):
        return item[index]

    ordered = list(original_array)
    ordered.sort(key=pick_key)
    return ordered
|
||
|
||
# 随机获得一个整数,左闭右闭
|
||
def get_random_number(start=0, end=1):
    """Return a random integer drawn from the closed interval [start, end].

    Args:
        start: Lower bound (inclusive).
        end: Upper bound (inclusive).

    Returns:
        An integer ``n`` with ``start <= n <= end``.
    """
    import random

    # randint includes both endpoints.
    return random.randint(start, end)
|
||
|
||
# 选取一个种子生成固定的随机整数,左闭右开
|
||
def generate_random_int_number_for_a_specific_seed(seed=0, x_min=0, x_max=10):
    """Return a reproducible random integer from the half-open range [x_min, x_max).

    The global NumPy random generator is re-seeded with ``seed`` on every
    call, so the same arguments always produce the same number.

    Args:
        seed: Seed passed to ``np.random.seed``.
        x_min: Lower bound (inclusive).
        x_max: Upper bound (exclusive).

    Returns:
        The integer drawn by ``np.random.randint(x_min, x_max)``.
    """
    import numpy as np

    np.random.seed(seed)  # note: resets NumPy's global random state
    return np.random.randint(x_min, x_max)
|
||
|
||
# 以显示编号的样式,打印数组
|
||
def print_array_with_index(array, show_index=1, index_type=0):
    """Print each element of ``array`` on its own line, optionally numbered.

    Args:
        array: Iterable of items to print.
        show_index: ``0`` prints only the items; any other value prefixes
            each item with a running number.
        index_type: ``0`` numbers the items starting from 0; any other value
            numbers them starting from 1. Ignored when ``show_index`` is 0.
    """
    if show_index == 0:
        for item in array:
            print(item)
        return
    first_label = 0 if index_type == 0 else 1
    for label, item in enumerate(array, start=first_label):
        print(label, item)
|
||
|
||
# 使用jieba软件包进行分词
|
||
def divide_text_into_words(text):
    """Segment ``text`` into a list of tokens using the jieba package.

    Args:
        text: The string to segment.

    Returns:
        The list produced by ``jieba.lcut(text)``.
    """
    import jieba

    return jieba.lcut(text)
|
||
|
||
# 根据一定的字符长度来分割文本
|
||
def split_text(text, wrap_width=3000):
    """Break ``text`` into pieces of at most ``wrap_width`` characters.

    Splitting is delegated to ``textwrap.wrap``.

    Args:
        text: The string to split.
        wrap_width: Maximum length of each piece.

    Returns:
        A list of strings, one per piece.
    """
    import textwrap

    return textwrap.wrap(text, width=wrap_width)
|
||
|
||
# 判断某个字符是中文还是英文或其他
|
||
def check_Chinese_or_English(a):
    """Classify a character as ``'Chinese'``, ``'English'`` or ``'Others'``.

    Args:
        a: The string (normally a single character) to classify.

    Returns:
        ``'Chinese'`` if ``a`` compares within U+4E00..U+9FFF,
        ``'English'`` if it compares within U+0000..U+00FF,
        otherwise ``'Others'``.
    """
    if '\u4e00' <= a <= '\u9fff':
        return 'Chinese'
    if '\x00' <= a <= '\xff':
        return 'English'
    return 'Others'
|
||
|
||
# 统计中英文文本的字数,默认不包括空格
|
||
def count_words(text, include_space=0, show_words=0):
    """Count the words in a mixed Chinese/English text.

    The text is tokenised with ``jieba.lcut``. A token whose first character
    is classified as Chinese contributes one word per character; every other
    token contributes exactly one word.

    Args:
        text: The string to analyse.
        include_space: ``0`` (default) drops tokens equal to a single space
            before counting; any other value keeps them.
        show_words: ``1`` prints the list of counted units; any other value
            prints nothing.

    Returns:
        The total number of words counted.
    """
    import jieba
    import guan

    tokens = jieba.lcut(text)
    if include_space == 0:
        tokens = [token for token in tokens if token != ' ']

    num_words = 0
    counted_units = []
    for token in tokens:
        # Only the first character decides how the whole token is treated.
        if guan.check_Chinese_or_English(token[0]) == 'Chinese':
            num_words += len(token)
            counted_units.extend(token)  # one entry per Chinese character
        else:
            # The original test here was `word_type == 'English' or 'Others'`,
            # which is always truthy; a plain else states the same behaviour
            # explicitly.
            num_words += 1
            counted_units.append(token)

    if show_words == 1:
        print(counted_units)
    return num_words
|
||
|
||
# 将RGB转成HEX
|
||
def rgb_to_hex(rgb, pound=1):
    """Convert an RGB triple into a lowercase hexadecimal colour string.

    Args:
        rgb: Iterable of three integers (tuple, list, ...), one per channel.
        pound: ``0`` returns the bare six hex digits; any other value
            prefixes the result with ``'#'``.

    Returns:
        A string such as ``'#ff0080'`` (or ``'ff0080'`` when ``pound`` is 0).

    Raises:
        TypeError: If ``rgb`` is not an iterable of exactly three integers.
    """
    # %-formatting only unpacks real tuples, so convert first; this lets
    # lists and other three-item iterables work as well.
    channels = tuple(rgb)
    prefix = '' if pound == 0 else '#'
    return prefix + '%02x%02x%02x' % channels
|
||
|
||
# 将HEX转成RGB
|
||
def hex_to_rgb(hex):
    """Convert a hexadecimal colour string into a tuple of integers.

    A leading ``'#'`` is ignored. The remaining digits are cut into pieces
    of ``len // 3`` characters and each piece is parsed as a base-16 number.

    Args:
        hex: Colour string such as ``'#ff0080'`` or ``'ff0080'``.

    Returns:
        A tuple of integers, e.g. ``(255, 0, 128)`` for ``'#ff0080'``.
    """
    digits = hex.lstrip('#')
    total = len(digits)
    step = total // 3
    channels = []
    for offset in range(0, total, step):
        channels.append(int(digits[offset:offset + step], 16))
    return tuple(channels)
|
||
|
||
# 使用MD5进行散列加密
|
||
def encryption_MD5(password, salt=''):
    """Return the MD5 hex digest of ``salt + password``.

    Args:
        password: The string to hash.
        salt: Optional string prepended to ``password`` before hashing.

    Returns:
        The 32-character hexadecimal MD5 digest.
    """
    import hashlib

    # NOTE(review): MD5 is a fast, collision-prone hash; prefer a dedicated
    # password-hashing scheme if this is used to store real credentials.
    salted = salt + password
    digest = hashlib.md5(salted.encode())
    return digest.hexdigest()
|
||
|
||
# 使用SHA-256进行散列加密
|
||
def encryption_SHA_256(password, salt=''):
    """Return the SHA-256 hex digest of ``salt + password``.

    Args:
        password: The string to hash.
        salt: Optional string prepended to ``password`` before hashing.

    Returns:
        The 64-character hexadecimal SHA-256 digest.
    """
    import hashlib

    salted = salt + password
    digest = hashlib.sha256(salted.encode())
    return digest.hexdigest()