import argparse
import json
import os
import sys

import numpy as np

current_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(current_dir, "V7_sft.model")
sys.path.append(os.path.join(current_dir, "transformers"))
from tokenization_internlm import InternLMTokenizer

tokenizer = InternLMTokenizer(vocab_file=model_path)


def write_bin(context: str, bin_file) -> None:
    """
    Tokenize a piece of raw text and append it to the opened bin file
    as one JSON line.

    Args:
        context (str): the raw text to tokenize.
        bin_file (file handler): the opened bin file.

    Example:
        >>> with open("out.bin", "ab") as f:
        ...     write_bin("今天天气晴朗适合出门散步", f)
        # out.bin now ends with the line:
        # {"tokens": [67577, 69095, 63010, 61770, 67783, 69301, 74732]}
    """
    # encode the context into a list of token ids,
    # e.g. [67577, 69095, 63010, 61770, 67783, 69301, 74732]
    tokens = tokenizer.encode(context)
    # wrap the list in a dict: key is the string 'tokens', value is the list,
    # e.g. {"tokens": [67577, 69095, 63010, 61770, 67783, 69301, 74732]}
    data = dict(tokens=tokens)
    # serialize the dict to JSON and encode it as bytes, one record per line
    saved_bin = (json.dumps(data) + "\n").encode()

    # write the bytes into bin_file
    bin_file.write(saved_bin)


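# A minimal round-trip sketch for write_bin (illustration only, not part of
# the original tool). It assumes InternLMTokenizer exposes a `decode` method,
# as HuggingFace-style tokenizers generally do; `check.bin` is a placeholder
# path, and decoding may not byte-for-byte recover unusual whitespace.
def _check_round_trip(context: str, path: str = "check.bin") -> None:
    with open(path, "wb") as f:
        write_bin(context, f)
    with open(path, "rb") as f:
        tokens = json.loads(f.readline())["tokens"]
    # for typical text the decoded ids should reproduce the input
    print(tokenizer.decode(tokens) == context)

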
def prepare_meta(bin_output_path: str):
    """
    Prepare metadata for the given bin file.

    Args:
        bin_output_path (str): Output bin file path.
    """
    meta = []
    cur = 0
    with open(bin_output_path, "rb") as f:
        while True:
            # read one JSON line
            line = f.readline()
            # an empty bytes object means end of file
            if line == b"":
                break
            # number of tokens in this line
            length = len(json.loads(line)["tokens"])
            # meta is a list of (cur, length) tuples:
            #   cur: byte offset at which this line starts
            #   length: number of tokens in this line
            meta.append((cur, length))
            # advance the byte offset to the start of the next line
            cur += len(line)

    # path of the generated meta file
    meta_fp = bin_output_path + ".meta"
    # save the meta information as an int32 array
    with open(meta_fp, "wb") as f:
        meta = np.array(meta, dtype=np.int32)
        np.save(f, meta)


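# A sketch of how the (offset, length) pairs in the .meta file can be used
# to fetch a single sample without scanning the whole bin file. `load_sample`
# is a hypothetical helper, not part of the original tool.
def load_sample(bin_path: str, index: int) -> list:
    # meta[i] == (byte offset of line i, token count of line i)
    meta = np.load(bin_path + ".meta")
    offset, _length = meta[index]
    with open(bin_path, "rb") as f:
        # jump directly to the line's byte offset and read one record
        f.seek(int(offset))
        return json.loads(f.readline())["tokens"]

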
def text2bin(text_input_path: str, bin_output_path: str):
    """
    Read content from the input file and write it to the bin file.
    Currently supports three input formats: 'txt', 'json' and 'jsonl'.

    Args:
        text_input_path (str): Input text file path.
        bin_output_path (str): Output bin file path.
    """
    # check that the input file exists
    if not os.path.isfile(text_input_path):
        raise FileNotFoundError(f"{text_input_path} does not exist.")

    file_format = text_input_path.split(".")[-1]
    assert file_format in ["txt", "json", "jsonl"], (
        "Invalid input file type. Currently supports `txt`, `json` and `jsonl`."
    )

    with open(text_input_path, "r") as text_file, open(bin_output_path, "ab") as bin_file:
        if file_format == "txt":
            for line in text_file:
                # strip any leading/trailing whitespace
                stripped_line = line.strip()
                if stripped_line:
                    # tokenize each non-empty line and append it to the bin file
                    write_bin(stripped_line, bin_file)

        elif file_format == "json":
            data = json.load(text_file)
            # assuming data is a list of dictionaries
            for record in data:
                # serialize each record (a dict) back into a JSON string
                context = json.dumps(record)
                # tokenize the string and append it to the bin file
                write_bin(context, bin_file)

        elif file_format == "jsonl":
            for line in text_file:
                # strip the trailing newline so it is not tokenized,
                # then append the JSON line to the bin file
                write_bin(line.strip(), bin_file)


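# For illustration (hypothetical file contents), each format yields one bin
# record per unit of text:
#   data.txt   -> one record per non-empty line of raw text
#   data.json  -> a top-level list; one record per serialized element
#   data.jsonl -> one record per JSON line

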
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text_input_path",
        type=str,
        required=True,
        help="Path to the input text file.",
    )
    parser.add_argument("--bin_output_path", type=str, required=True, help="Path to the output bin file.")

    return parser.parse_args()


def main():
    # parse command-line arguments
    args = parse_args()

    text2bin(args.text_input_path, args.bin_output_path)
    print(f"Successfully converted {args.text_input_path} to {args.bin_output_path}")

    # to avoid potential read/write conflicts, prepare the metadata only
    # after the .bin file has been fully written
    prepare_meta(args.bin_output_path)
    print(f"Successfully generated {args.bin_output_path}.meta")


if __name__ == "__main__":
    main()
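# Example invocation (script name and paths are placeholders):
#   python text2bin.py --text_input_path data.txt --bin_output_path data.bin
# This appends one {"tokens": [...]} line per sample to data.bin, then writes
# the (byte offset, token count) pairs to data.bin.meta.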