TTY OV  1
A cross platform python terminal
Loading...
Searching...
No Matches
generate_db.py
Go to the documentation of this file.
1"""
2# +==== BEGIN polyguard =================+
3# LOGO:
4# input
5#
6# @#$%! hello
7# | |
8# +--+--+
9# |
10# v
11# +------------+
12# | POLY GUARD |
13# +------------+
14# | |
15# v v
16# BLOCKED PASSED
17# KO OK
18# /STOP
19# PROJECT: polyguard
20# FILE: generate_db.py
21# CREATION DATE: 21-03-2026
22# LAST Modified: 19:51:5 21-03-2026
23# DESCRIPTION:
24# A module that provides a set of swearwords to watch for when filtering, while allowing individual languages to be toggled on and off.
25# Build-time helper to generate the SQLite DB from plaintext word lists.
26#
27# Usage (console script): polyguard-generate-db --source-dir ./wordlists --db-path <path>
28#
29# The script expects files named using language codes (e.g. `en_uk.txt`, `fr.txt`).
30# Files whose stem doesn't match a known `Langs` entry will be stored under `Langs.OTHER`.
31# /STOP
32# COPYRIGHT: (c) Henry Letellier
33# PURPOSE: This is the file in charge of generating the database file.
34# // AR
35# +==== END polyguard =================+
36"""
37
38import os
39from typing import Dict, Iterable, Optional
40import argparse
41from display_tty import Disp, initialise_logger
42
43from . import constants as POLY_CONST
44from .normalise import Normalise
45from .sqlite_handler import SQLiteHandler
46
# Module-wide display/logger instance shared by every function in this file.
# NOTE(review): the second positional argument is defined by display_tty's
# `initialise_logger`; presumably it is the debug toggle - confirm.
IDISP: Disp = initialise_logger("Generate DB", False)
48
49
def build_db_from_dir(source_dir: str, db_path: str) -> int:
    """Scan a directory for .txt wordlists and populate the SQLite database.

    Every regular file in `source_dir` whose name ends in ``.txt``
    (case-insensitive) is loaded through `Normalise.load_from_file` and its
    words are grouped by language. The language is looked up from the file
    stem against the values of `POLY_CONST.Langs`; stems that match no value
    are grouped under `Langs.OTHER`. Several files resolving to the same
    language are merged rather than overwriting one another.

    Args:
        source_dir: Path to the directory containing .txt wordlist files.
        db_path: Path where the SQLite database will be created or updated.
            A bare filename (no directory component) is accepted.

    Returns:
        int: Number of wordlist files processed. Returns 0 if no files were
        found, in which case the database is not touched.

    Raises:
        OSError: When the source directory cannot be listed or the database
            directory cannot be created.
        sqlite3.Error: When database operations fail (as raised by
            `SQLiteHandler`; presumably sqlite3 errors - confirm).
    """
    IDISP.log_info(f"Scanning source directory for .txt files: {source_dir}")
    try:
        entries = os.listdir(source_dir)
    except OSError as exc:
        IDISP.log_error(
            f"Failed to list source directory '{source_dir}': {exc}"
        )
        raise

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # processing (and merge) order reproducible across platforms and runs.
    files = sorted(
        os.path.join(source_dir, entry)
        for entry in entries
        if entry.lower().endswith(".txt")
        and os.path.isfile(os.path.join(source_dir, entry))
    )

    if not files:
        IDISP.log_info("No .txt wordlist files found; nothing to do")
        return 0

    # Map language enum values to Enum members for fast lookup
    lang_map: Dict[str, POLY_CONST.Langs] = {
        lang.value: lang for lang in POLY_CONST.Langs
    }

    mapping: Dict[POLY_CONST.Langs, Iterable[str]] = {}

    for file_path in files:
        stem = os.path.splitext(os.path.basename(file_path))[0]

        # Determine language by the filename stem. Expected forms include
        # language codes such as 'en_uk', 'fr', 'es'. If the stem does not
        # match a known `Langs` value, the words are stored under
        # `Langs.OTHER`.
        lang_key = lang_map.get(stem, POLY_CONST.Langs.OTHER)
        IDISP.log_info(
            f"Processing file '{file_path}' -> language '{lang_key.value}'"
        )

        words = Normalise.load_from_file(file_path)

        # Merge instead of assigning: every unknown stem resolves to
        # `Langs.OTHER`, so a plain assignment would silently discard the
        # words of all but the last such file. Stored as a list for
        # sqlite_handler.bulk_insert.
        merged = list(mapping.get(lang_key, ()))
        merged.extend(words)
        mapping[lang_key] = merged

    # Ensure the parent folder exists for the DB path. dirname() is '' for a
    # bare filename, and os.makedirs('') raises FileNotFoundError, so only
    # create the folder when there is one to create.
    db_dir = os.path.dirname(db_path)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    IDISP.log_info(f"Creating/updating DB at: {db_path}")
    handler = SQLiteHandler(db_path, readonly=False)
    try:
        handler.connect()
        handler.create_schema()

        inserted = handler.bulk_insert(mapping)
        IDISP.log_info(
            f"Bulk insert completed; inserted ~{inserted} rows (per-lang sums)"
        )

    finally:
        # Always release the connection, even when schema creation or the
        # insert fails.
        handler.close()

    # Report files (the documented contract), not distinct languages: the two
    # differ as soon as several files share a language bucket.
    return len(files)
135
136
def main(argv: Optional[list[str]] = None) -> int:
    """Command-line entry point that builds the database from wordlists.

    Reads the source directory and the target database path from the command
    line (falling back to the defaults in `POLY_CONST`), hands both to
    build_db_from_dir(), then reports the resulting count through the logger
    and on standard output.

    Args:
        argv: Optional list of command-line arguments. When None, argparse
            reads them from sys.argv.

    Returns:
        int: Exit code; always 0 when this function returns normally.
    """
    cli = argparse.ArgumentParser(
        description="Generate polyguard SQLite DB from text lists"
    )

    # (flag, default value, help text) for each supported option.
    option_specs = (
        (
            "--source-dir",
            POLY_CONST.DEFAULT_SOURCE_WORDS,
            "Directory containing newline-delimited .txt word lists",
        ),
        (
            "--db-path",
            POLY_CONST.DEFAULT_DB_PATH,
            "Path to the SQLite DB to create",
        ),
    )
    for flag, fallback, description in option_specs:
        cli.add_argument(flag, default=fallback, help=description)

    options = cli.parse_args(argv)

    processed = build_db_from_dir(options.source_dir, options.db_path)

    # The same summary goes to the logger and to stdout.
    summary = f"Processed {processed} language files into {options.db_path}"
    IDISP.log_info(summary)
    print(summary)

    return 0
171
172
# Run the CLI when the module is executed as a script, and propagate the
# value returned by main() as the process exit status.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)
int build_db_from_dir(str source_dir, str db_path)
int main(Optional[list[str]] argv=None)