update
@@ -0,0 +1,142 @@
import argparse
import json
import os
import sys

import numpy as np

current_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(current_dir, "V7_sft.model")
sys.path.append(os.path.join(current_dir, "transformers"))
from tokenization_internlm import InternLMTokenizer

tokenizer = InternLMTokenizer(vocab_file=model_path)


def write_bin(context: str, bin_file) -> None:
    """
    Write one tokenized sample into the opened bin file.

    Args:
        context (str): the text content of one sample from the raw file.
        bin_file (file handle): the bin file, opened for binary writing.

    Example:
        >>> with open("out.bin", "ab") as f:
        ...     write_bin("今天天气晴朗适合出门散步", f)  # "the weather is clear today, good for a walk"
        appends one JSON line to out.bin:
        {"tokens": [67577, 69095, 63010, 61770, 67783, 69301, 74732]}
    """
    # encode the context into a list of token ids,
    # e.g. [67577, 69095, 63010, 61770, 67783, 69301, 74732]
    tokens = tokenizer.encode(context)
    # wrap the list in a dict whose key is the string 'tokens',
    # e.g. {"tokens": [67577, 69095, 63010, 61770, 67783, 69301, 74732]}
    data = dict(tokens=tokens)
    # serialize the dict to JSON and encode it as bytes, one sample per line
    saved_bin = str.encode(json.dumps(data) + "\n")

    # write the bytes into bin_file
    bin_file.write(saved_bin)


def prepare_meta(bin_output_path: str):
    """
    Prepare the metadata for the given bin file.

    Args:
        bin_output_path (str): the path of the generated bin file.
    """
    meta = []
    cur = 0
    with open(bin_output_path, "rb") as f:
        while True:
            # read one line (one tokenized sample)
            line = f.readline()
            # an empty bytes object means end of file
            if line == b"":
                break
            # count the tokens in this line
            length = len(json.loads(line)["tokens"])
            # meta is a list of (cur, length) tuples:
            # cur: the byte offset where this line starts
            # length: the number of tokens in this line
            meta.append((cur, length))
            # advance the offset to the start of the next line
            cur += len(line)

    # the meta file sits next to the bin file, with a ".meta" suffix
    meta_fp = bin_output_path + ".meta"
    # save the collected meta information as an int32 numpy array
    with open(meta_fp, "wb") as f:
        meta = np.array(meta, dtype=np.int32)
        np.save(f, meta)
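
# How the meta file can be consumed (a sketch for illustration, not part of
# this script; file names are hypothetical): each (cur, length) row records
# the byte offset where a sample starts in the .bin file and its token count,
# so any sample can be fetched by seeking instead of scanning the whole file:
#
#   meta = np.load("out.bin.meta")        # int32 array of shape (num_lines, 2)
#   cur, length = meta[3]                 # offset and token count of the 4th sample
#   with open("out.bin", "rb") as f:
#       f.seek(cur)                       # jump straight to the sample's offset
#       sample = json.loads(f.readline()) # {"tokens": [...]} with `length` tokens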


def text2bin(text_input_path: str, bin_output_path: str):
    """
    Read content from the input file and write it to a bin file.
    Currently supports three input formats: 'txt', 'json' and 'jsonl'.

    Args:
        text_input_path (str): input text file path.
        bin_output_path (str): output bin file path.
    """
    # check that the input file exists
    if not os.path.isfile(text_input_path):
        raise FileNotFoundError(f"{text_input_path} does not exist.")

    file_format = text_input_path.split(".")[-1]
    assert file_format in ["txt", "json", "jsonl"], (
        "Invalid input file type. Currently supports `txt`, `json` and `jsonl`."
    )

    with open(text_input_path, "r") as text_file, open(bin_output_path, "ab") as bin_file:
        if file_format == "txt":
            for line in text_file:
                # strip any leading/trailing whitespace
                stripped_line = line.strip()
                if stripped_line:
                    # tokenize each non-empty line and write it to the bin file
                    write_bin(stripped_line, bin_file)

        elif file_format == "json":
            data = json.load(text_file)
            # assuming data is a list of dictionaries
            for record in data:
                # record is a dict; serialize it back to a JSON string
                context = json.dumps(record)
                # tokenize the string and write it to the bin file
                write_bin(context, bin_file)

        elif file_format == "jsonl":
            for line in text_file:
                # tokenize each raw line and write it to the bin file
                write_bin(line, bin_file)
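
# A minimal end-to-end sketch of calling the conversion directly (paths are
# hypothetical). Note that the bin file is opened in append mode ("ab"), so
# converting several inputs into the same output accumulates their samples:
#
#   text2bin("corpus.jsonl", "corpus.bin")
#   text2bin("extra.txt", "corpus.bin")
#   prepare_meta("corpus.bin")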


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text_input_path",
        type=str,
        required=True,
        help="Path to the input text file.",
    )
    parser.add_argument("--bin_output_path", type=str, required=True, help="Path to the output bin file.")

    return parser.parse_args()


def main():
    # parse command-line arguments
    args = parse_args()

    text2bin(args.text_input_path, args.bin_output_path)
    print(f"Successfully converted {args.text_input_path} to {args.bin_output_path}")

    # the metadata is prepared only after the .bin file is fully written,
    # to avoid reading a file that is still being appended to
    prepare_meta(args.bin_output_path)
    print(f"Successfully generated {args.bin_output_path}.meta")


if __name__ == "__main__":
    main()
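
# Example invocation (the script name and paths are illustrative):
#
#   python tokenizer.py --text_input_path raw_data.txt --bin_output_path result.bin
#
# This produces result.bin (one {"tokens": [...]} JSON line per sample) and
# result.bin.meta (a numpy array of (byte offset, token count) pairs).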