Migrate to Gitea
scripts/search_word.py · new file · 110 lines added
@@ -0,0 +1,110 @@
import json
import pathlib
from datetime import datetime


INPUT_FILE_NAME = "fr-raw-wiktextract-data.jsonl"  # <-- Update this to your file

# --- Dynamic Path Setup ---
SCRIPT_DIR = pathlib.Path(__file__).parent
ROOT_DIR = SCRIPT_DIR.parent
INPUT_FILE = ROOT_DIR / "raw_data" / INPUT_FILE_NAME


# --- Filter Configuration ---
# Set the POS (part of speech) you want to filter for
# Examples: "noun", "verb", "adj", "adv", etc.
# Set to None to skip POS filtering
FILTER_POS = "noun"

# Set the word you want to filter for
# Set to None to skip word filtering
FILTER_WORD = "grenouille"

# Set word prefix to filter for (e.g., "Septem" will match "September")
# Set to None to skip prefix filtering
FILTER_PREFIX = None

# Set word suffix to filter for (e.g., "ber" will match "September")
# Set to None to skip suffix filtering
FILTER_SUFFIX = None

# Maximum number of results to include (set to None for unlimited)
MAX_RESULTS = 5


def matches_filters(entry):
    """Check if an entry matches all active filters."""

    # Filter by POS
    if FILTER_POS is not None:
        if entry.get("pos") != FILTER_POS:
            return False

    # Filter by exact word
    if FILTER_WORD is not None:
        if entry.get("word") != FILTER_WORD:
            return False

    # Filter by prefix
    if FILTER_PREFIX is not None:
        word = entry.get("word", "")
        if not word.startswith(FILTER_PREFIX):
            return False

    # Filter by suffix
    if FILTER_SUFFIX is not None:
        word = entry.get("word", "")
        if not word.endswith(FILTER_SUFFIX):
            return False

    return True


def filter_and_save(file_path):
    """Filter JSONL file and save matching entries."""

    # Generate output filename with original filename and timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = file_path.parent / f"{file_path.stem}_filtered_{timestamp}.jsonl"

    match_count = 0
    total_lines = 0

    with open(file_path, 'r', encoding='utf-8') as infile:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for line in infile:
                total_lines += 1

                try:
                    entry = json.loads(line)

                    # Check if entry matches filters
                    if matches_filters(entry):
                        outfile.write(json.dumps(entry, ensure_ascii=False))
                        outfile.write('\n')
                        match_count += 1

                        # Stop if we've reached max results
                        if MAX_RESULTS is not None and match_count >= MAX_RESULTS:
                            break

                except json.JSONDecodeError:
                    print(f"Warning: Line {total_lines} is not valid JSON.")

    print(f"Filtered {match_count} entries from {total_lines} total lines")
    print(f"Output written to: {output_file}")

    # Print active filters
    print("\nActive filters:")
    if FILTER_POS:
        print(f" - POS: {FILTER_POS}")
    if FILTER_WORD:
        print(f" - Word (exact): {FILTER_WORD}")
    if FILTER_PREFIX:
        print(f" - Prefix: {FILTER_PREFIX}")
    if FILTER_SUFFIX:
        print(f" - Suffix: {FILTER_SUFFIX}")


if __name__ == "__main__":
    filter_and_save(INPUT_FILE)
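For context, each line of the wiktextract JSONL dump is a standalone JSON object, and the filters above only read its "word" and "pos" keys. A minimal sketch of how one line is matched, assuming matches_filters from the script above is in scope (the sample entry is hypothetical and trimmed; real dump lines carry many more keys):

import json

# Hypothetical, trimmed wiktextract-style line.
sample_line = '{"word": "grenouille", "pos": "noun"}'

entry = json.loads(sample_line)
# With FILTER_POS = "noun" and FILTER_WORD = "grenouille", this entry passes.
print(matches_filters(entry))  # -> True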