51 """Scan directory for .txt files and populate SQLite database.
53 Scans `source_dir` for .txt wordlist files, normalizes each one, and
54 inserts the words into the SQLite database at `db_path`. File stems are
55 mapped to Langs enum values; unmapped stems default to Langs.OTHER.
58 source_dir: Path to directory containing .txt wordlist files.
59 db_path: Path where SQLite database will be created or updated.
62 int: Number of language files processed. Returns 0 if no files found.
65 OSError: When source directory cannot be read.
66 sqlite3.Error: When database operations fail.
70 IDISP.log_info(f
"Scanning source directory for .txt files: {source_dir}")
72 entries = os.listdir(source_dir)
73 except OSError
as exc:
75 f
"Failed to list source directory '{source_dir}': {exc}"
80 full = os.path.join(source_dir, entry)
81 if not os.path.isfile(full):
84 if not entry.lower().endswith(
".txt"):
90 IDISP.log_info(
"No .txt wordlist files found; nothing to do")
94 lang_map: Dict[str, POLY_CONST.Langs] = {}
95 for l
in POLY_CONST.Langs:
98 mapping: Dict[POLY_CONST.Langs, Iterable[str]] = {}
100 for file_path
in files:
101 stem = os.path.splitext(os.path.basename(file_path))[0]
107 lang_key = lang_map.get(stem, POLY_CONST.Langs.OTHER)
109 f
"Processing file '{file_path}' -> language '{lang_key.value}'"
112 words = Normalise.load_from_file(file_path)
115 mapping[lang_key] = list(words)
118 os.makedirs(os.path.dirname(db_path), exist_ok=
True)
120 IDISP.log_info(f
"Creating/updating DB at: {db_path}")
124 handler.create_schema()
126 inserted = handler.bulk_insert(mapping)
128 f
"Bulk insert completed; inserted ~{inserted} rows (per-lang sums)"
137def main(argv: Optional[list[str]] =
None) -> int:
138 """CLI entrypoint for database generation from wordlists.
140 Parses command-line arguments for source directory and output database path,
141 then invokes build_db_from_dir() to populate the database.
144 argv: Optional list of command-line arguments. If None, argparse falls back to sys.argv[1:].
147 int: Exit code (0 for success).
149 parser = argparse.ArgumentParser(
150 description=
"Generate polyguard SQLite DB from text lists")
154 default=POLY_CONST.DEFAULT_SOURCE_WORDS,
155 help=
"Directory containing newline-delimited .txt word lists"
159 default=POLY_CONST.DEFAULT_DB_PATH,
160 help=
"Path to the SQLite DB to create"
163 args = parser.parse_args(argv)
167 IDISP.log_info(f
"Processed {count} language files into {args.db_path}")
168 print(f
"Processed {count} language files into {args.db_path}")