update
@@ -0,0 +1,142 @@
import argparse
import json
import os
import sys

import numpy as np

current_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(current_dir, "V7_sft.model")
sys.path.append(os.path.join(current_dir, "transformers"))
from tokenization_internlm import InternLMTokenizer

tokenizer = InternLMTokenizer(vocab_file=model_path)


def write_bin(context: str, bin_file) -> None:
    """
    Write one tokenized sample into the opened bin file.

    Args:
        context (str): the text content of one sample from the raw file.
        bin_file (file handle): the bin file, opened for binary writing.

    Example:
        >>> with open("out.bin", "ab") as f:
        ...     write_bin("今天天气晴朗适合出门散步", f)  # "the weather is clear today, good for a walk"
        appends one JSON line to out.bin:
        {"tokens": [67577, 69095, 63010, 61770, 67783, 69301, 74732]}
    """
    # encode the context into a list of token ids,
    # e.g. [67577, 69095, 63010, 61770, 67783, 69301, 74732]
    tokens = tokenizer.encode(context)
    # wrap the list in a dict whose key is the string 'tokens',
    # e.g. {"tokens": [67577, 69095, 63010, 61770, 67783, 69301, 74732]}
    data = dict(tokens=tokens)
    # serialize the dict to JSON and encode it as bytes, one sample per line
    saved_bin = str.encode(json.dumps(data) + "\n")

    # write the bytes into bin_file
    bin_file.write(saved_bin)


def prepare_meta(bin_output_path: str):
    """
    Prepare the metadata for the given bin file.

    Args:
        bin_output_path (str): the path of the generated bin file.
    """
    meta = []
    cur = 0
    with open(bin_output_path, "rb") as f:
        while True:
            # read one line (one tokenized sample)
            line = f.readline()
            # an empty bytes object means end of file
            if line == b"":
                break
            # count the tokens in this line
            length = len(json.loads(line)["tokens"])
            # meta is a list of (cur, length) tuples:
            # cur: the byte offset where this line starts
            # length: the number of tokens in this line
            meta.append((cur, length))
            # advance the offset to the start of the next line
            cur += len(line)

    # the meta file sits next to the bin file, with a ".meta" suffix
    meta_fp = bin_output_path + ".meta"
    # save the collected meta information as an int32 numpy array
    with open(meta_fp, "wb") as f:
        meta = np.array(meta, dtype=np.int32)
        np.save(f, meta)
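
# How the meta file can be consumed (a sketch for illustration, not part of
# this script; file names are hypothetical): each (cur, length) row records
# the byte offset where a sample starts in the .bin file and its token count,
# so any sample can be fetched by seeking instead of scanning the whole file:
#
#   meta = np.load("out.bin.meta")        # int32 array of shape (num_lines, 2)
#   cur, length = meta[3]                 # offset and token count of the 4th sample
#   with open("out.bin", "rb") as f:
#       f.seek(cur)                       # jump straight to the sample's offset
#       sample = json.loads(f.readline()) # {"tokens": [...]} with `length` tokens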


def text2bin(text_input_path: str, bin_output_path: str):
    """
    Read content from the input file and write it to a bin file.
    Currently supports three input formats: 'txt', 'json' and 'jsonl'.

    Args:
        text_input_path (str): input text file path.
        bin_output_path (str): output bin file path.
    """
    # check that the input file exists
    if not os.path.isfile(text_input_path):
        raise FileNotFoundError(f"{text_input_path} does not exist.")

    file_format = text_input_path.split(".")[-1]
    assert file_format in ["txt", "json", "jsonl"], (
        "Invalid input file type. Currently supports `txt`, `json` and `jsonl`."
    )

    with open(text_input_path, "r") as text_file, open(bin_output_path, "ab") as bin_file:
        if file_format == "txt":
            for line in text_file:
                # strip any leading/trailing whitespace
                stripped_line = line.strip()
                if stripped_line:
                    # tokenize each non-empty line and write it to the bin file
                    write_bin(stripped_line, bin_file)

        elif file_format == "json":
            data = json.load(text_file)
            # assuming data is a list of dictionaries
            for record in data:
                # record is a dict; serialize it back to a JSON string
                context = json.dumps(record)
                # tokenize the string and write it to the bin file
                write_bin(context, bin_file)

        elif file_format == "jsonl":
            for line in text_file:
                # tokenize each raw line and write it to the bin file
                write_bin(line, bin_file)
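
# A minimal end-to-end sketch of calling the conversion directly (paths are
# hypothetical). Note that the bin file is opened in append mode ("ab"), so
# converting several inputs into the same output accumulates their samples:
#
#   text2bin("corpus.jsonl", "corpus.bin")
#   text2bin("extra.txt", "corpus.bin")
#   prepare_meta("corpus.bin")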


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text_input_path",
        type=str,
        required=True,
        help="Path to the input text file.",
    )
    parser.add_argument("--bin_output_path", type=str, required=True, help="Path to the output bin file.")

    return parser.parse_args()


def main():
    # parse command-line arguments
    args = parse_args()

    text2bin(args.text_input_path, args.bin_output_path)
    print(f"Successfully converted {args.text_input_path} to {args.bin_output_path}")

    # the metadata is prepared only after the .bin file is fully written,
    # to avoid reading a file that is still being appended to
    prepare_meta(args.bin_output_path)
    print(f"Successfully generated {args.bin_output_path}.meta")


if __name__ == "__main__":
    main()
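
# Example invocation (the script name and paths are illustrative):
#
#   python tokenizer.py --text_input_path raw_data.txt --bin_output_path result.bin
#
# This produces result.bin (one {"tokens": [...]} JSON line per sample) and
# result.bin.meta (a numpy array of (byte offset, token count) pairs).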