Migrate to gitea
This commit is contained in:
38
scripts/printline.py
Normal file
38
scripts/printline.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import json
|
||||
import pathlib
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
# Name of the JSONL dump to read, looked up under <root>/raw_data/.
INPUT_FILE_NAME = "fr_raw-wiktextract-data.jsonl"
# Resolve to an absolute path first: when __file__ is relative (e.g. the script
# is launched as `python printline.py` on interpreters that keep the relative
# name), `.parent.parent` would otherwise collapse to "." and point INPUT_FILE
# at the wrong directory.
SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME


# --- Configuration ---
START_LINE = 99  # 1-based index (first line is 1)
NUM_LINES = 99  # Number of lines/objects to write
|
||||
|
||||
|
||||
def extract_lines_to_file(file_path: pathlib.Path, start_line: int, num_lines: int) -> None:
    """Pretty-print a window of JSONL records into a timestamped file.

    Reads ``file_path`` line by line and, for every line whose 1-based index
    falls in ``[start_line, start_line + num_lines)``, writes the parsed JSON
    object re-serialised with ``indent=2`` (non-ASCII characters kept as-is),
    followed by a newline. Lines in the window that are not valid JSON are
    reported in the output as ``Error: Line <n> is not valid JSON.``.

    The output file is created next to the input file and named
    ``<YYYYmmdd_HHMMSS>.json``. It contains the pretty-printed objects one
    after another, not a single JSON document.

    Args:
        file_path: Path to the JSONL input file; its parent directory
            receives the output file.
        start_line: 1-based index of the first line to extract.
        num_lines: Number of consecutive lines to extract.
    """
    # Generate timestamp filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = file_path.parent / f"{timestamp}.json"
    end_line = start_line + num_lines  # exclusive upper bound of the window

    with open(file_path, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for i, line in enumerate(infile, start=1):
            if i >= end_line:
                # Past the requested window: stop instead of scanning the
                # remainder of a potentially very large input file.
                break
            if i < start_line:
                continue
            try:
                element = json.loads(line)
            except json.JSONDecodeError:
                outfile.write(f"Error: Line {i} is not valid JSON.\n")
            else:
                outfile.write(json.dumps(element, indent=2, ensure_ascii=False))
                outfile.write('\n')

    print(f"Output written to: {output_file}")
|
||||
|
||||
|
||||
# Script entry point: run the extraction with the module-level configuration.
if __name__ == "__main__":
    extract_lines_to_file(
        file_path=INPUT_FILE,
        start_line=START_LINE,
        num_lines=NUM_LINES,
    )
|
||||
Reference in New Issue
Block a user