update
This commit is contained in:
		| @@ -0,0 +1,142 @@ | ||||
| import argparse | ||||
| import json | ||||
| import os | ||||
| import sys | ||||
|  | ||||
| import numpy as np | ||||
|  | ||||
# Directory that contains this script; the paths below are resolved relative to it.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Vocabulary/model file handed to the tokenizer as `vocab_file`.
model_path = os.path.join(current_dir, "V7_sft.model")
# Put the sibling "transformers" directory on the import path. This must happen
# before the import below; presumably `tokenization_internlm` lives there.
sys.path.append(os.path.join(current_dir, "transformers"))
from tokenization_internlm import InternLMTokenizer

# Module-level tokenizer instance shared by the functions in this file.
tokenizer = InternLMTokenizer(vocab_file=model_path)
|  | ||||
|  | ||||
def write_bin(context: str, bin_file) -> None:
    """
    Tokenize one piece of text and write it to the bin file as a JSON line.

    Args:
        context (str): the raw text to tokenize.
        bin_file (file handler): the opened bin file; it receives bytes.

    Example:
    >>> with open("out.bin", "ab") as bin_file:
    ...     write_bin("今天天气晴朗适合出门散步", bin_file)
    >>> # out.bin now ends with the line:
    >>> # {"tokens": [67577, 69095, 63010, 61770, 67783, 69301, 74732]}
    """
    # the tokenizer turns the text into a list of tokens
    token_ids = tokenizer.encode(context)
    # one record per line: a JSON object whose only key is 'tokens'
    record = json.dumps({"tokens": token_ids}) + "\n"
    # the record is written as bytes
    bin_file.write(record.encode())
|  | ||||
|  | ||||
def prepare_meta(bin_output_path: str):
    """
    Prepare metadata for the given bin file.

    Each line of the bin file is a JSON object with a 'tokens' list. For every
    line, the byte offset at which it starts and its number of tokens are
    recorded; the resulting table is saved with `np.save` to
    `<bin_output_path>.meta`.

    Args:
        bin_output_path (str): Output bin file path.

    Raises:
        json.JSONDecodeError: if a line of the bin file is not valid JSON.
        KeyError: if a line has no 'tokens' entry.
    """
    meta = []
    cur = 0
    with open(bin_output_path, "rb") as f:
        # iterating a binary file yields one newline-terminated line at a time
        for line in f:
            # obtain the token amount of each line
            length = len(json.loads(line)["tokens"])
            # meta is a list of tuple(cur, length)
            # cur: the start index (byte offset) of each line
            # length: the token amount of each line
            meta.append((cur, length))
            # update the cur to generate the meta information of next line
            cur += len(line)

    # Build the table in int64 first so large offsets cannot overflow, and force
    # two columns so an empty bin file still yields a (0, 2) table.
    meta_array = np.array(meta, dtype=np.int64).reshape(-1, 2)
    # Keep the int32 dtype whenever every value fits; only bin files whose
    # offsets exceed the int32 range (about 2 GiB) are stored as int64.
    if meta_array.size == 0 or meta_array.max() <= np.iinfo(np.int32).max:
        meta_array = meta_array.astype(np.int32)

    # define path of the generated meta file
    meta_fp = bin_output_path + ".meta"
    # save the generated meta information
    with open(meta_fp, "wb") as f:
        np.save(f, meta_array)
|  | ||||
|  | ||||
def text2bin(text_input_path: str, bin_output_path: str):
    """
    Read content from the input file and write to bin file.
    Currently support 3 input formats: 'txt', 'json' and 'jsonl'.

    The format is taken from the text after the last '.' in the input path:

    - 'txt': every non-blank line, stripped of surrounding whitespace, is one sample.
    - 'json': the whole file is loaded and iterated; every item is serialized
      back to a string with `json.dumps` and becomes one sample.
    - 'jsonl': every non-blank line is one sample.

    The bin file is opened in append mode, so samples are added after any
    content it already holds.

    Args:
        text_input_path (str): input file path.
        bin_output_path (str): output bin file path.

    Raises:
        FileNotFoundError: if `text_input_path` is not an existing file.
        ValueError: if the input file type is not 'txt', 'json' or 'jsonl'.
    """
    # Check if the input file exists
    if not os.path.isfile(text_input_path):
        raise FileNotFoundError(f"{text_input_path} does not exist.")

    file_format = text_input_path.split(".")[-1]
    # Raise explicitly: an `assert` would be stripped when Python runs with -O.
    if file_format not in ("txt", "json", "jsonl"):
        raise ValueError("Invalid input file type. Currently support `txt`, `json` and `jsonl`.")

    # Read the input as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(text_input_path, "r", encoding="utf-8") as text_file, open(bin_output_path, "ab") as bin_file:
        if file_format == "txt":
            for line in text_file:
                # Strip any leading/trailing whitespace
                stripped_line = line.strip()
                if stripped_line:
                    # Pass each line to the write_bin function
                    write_bin(stripped_line, bin_file)

        elif file_format == "json":
            data = json.load(text_file)
            # assuming data is a list of dictionaries
            for record in data:
                # transfer the record into str; json.dumps escapes non-ASCII
                # characters by default.
                # NOTE(review): confirm the tokenizer is meant to see the escaped form.
                context = json.dumps(record)
                # encode the str and write into bin
                write_bin(context, bin_file)

        elif file_format == "jsonl":
            for line in text_file:
                # a blank line is not a record; skip it instead of tokenizing it
                if not line.strip():
                    continue
                # The line is passed on unchanged (trailing newline included) so
                # that existing records keep producing the same tokens.
                write_bin(line, bin_file)
|  | ||||
|  | ||||
def parse_args():
    """
    Parse the command line options of this script.

    Both options are required strings:
        --text_input_path: path to the input text file.
        --bin_output_path: path to the output bin file.

    Returns:
        argparse.Namespace: holds `text_input_path` and `bin_output_path`.
    """
    cli = argparse.ArgumentParser()
    # (flag, help text) for every option; all of them are required strings
    options = (
        ("--text_input_path", "Path to the input text file."),
        ("--bin_output_path", "Path to the output bin file."),
    )
    for flag, description in options:
        cli.add_argument(flag, type=str, required=True, help=description)

    return cli.parse_args()
|  | ||||
|  | ||||
def main():
    """
    Convert the input text file to a bin file, then build the bin file's meta file.

    The paths are read from the command line via `parse_args`.
    """
    args = parse_args()
    source, target = args.text_input_path, args.bin_output_path

    # step 1: tokenize the input and write the bin file
    text2bin(source, target)
    print(f"Successfully converted {args.text_input_path} to {args.bin_output_path}")

    # step 2: the meta file is generated only after the bin file has been
    # created, to avoid potential read/write errors
    prepare_meta(target)
    print(f"Successfully generated {args.bin_output_path}.meta")


if __name__ == "__main__":
    main()
		Reference in New Issue
	
	Block a user