Cat Feeder  1.0.0
The Cat feeder project
Loading...
Searching...
No Matches
document_to_document.py
Go to the documentation of this file.
1r"""
2# +==== BEGIN CatFeeder =================+
3# LOGO:
4# ..............(..../\
5# ...............)..(.')
6# ..............(../..)
7# ...............\‍(__)|
8# Inspired by Joan Stark
9# source https://www.asciiart.eu/
10# animals/cats
11# /STOP
12# PROJECT: CatFeeder
13# FILE: document_to_document.py
14# CREATION DATE: 15-01-2026
15# LAST Modified: 1:32:47 17-01-2026
16# DESCRIPTION:
17# This is the backend server in charge of making the actual website work.
18# /STOP
19# COPYRIGHT: (c) Cat Feeder
20# PURPOSE: The file containing the code for converting documents from one format to another.
21# // AR
22# +==== END CatFeeder =================+
23"""
24
25from io import BytesIO
26from typing import Optional, Set
27from pathlib import Path
28import tempfile
29
30from display_tty import Disp, initialise_logger
31
32# Document processing libraries
33import pypandoc
34
35from .aliases import DOCUMENT_FORMAT_ALIASES
36from . import converters_constants as CONV_CONST
37
38from ..http_constants import DataTypes, MEDIA_TYPES
39
40from ...core import FinalClass
41from ...utils import CONST
42from ...tinytex import TinyTeXInstaller
43
44
class DocumentToDocument(metaclass=FinalClass):
    """Convert documents from one format to another using Pandoc.

    Two conversion strategies are used:
    - Text-based formats (MD, HTML, JSON, etc.) are handed to Pandoc as
      strings, so no temporary files are created.
    - Every other format goes through temporary files on disk, which the
      project treats as billable I/O.

    The class is a singleton: ``__new__`` always returns the same instance.
    Note that ``__init__`` still runs on every ``DocumentToDocument()`` call.
    """

    disp: Disp = initialise_logger(__qualname__, CONST.DEBUG)

    # The unique instance handed out by __new__.
    _instance: Optional["DocumentToDocument"] = None

    # Set to True once TinyTeX has been installed successfully.
    _downloaded_tinytex: bool = False

    # Text-based formats that can be converted without temporary files.
    _TEXT_BASED_FORMATS: Set[DataTypes] = {
        DataTypes.TXT, DataTypes.TEXT, DataTypes.PLAIN,
        DataTypes.MARKDOWN, DataTypes.MD,
        DataTypes.HTML, DataTypes.XML, DataTypes.XHTML,
        DataTypes.JSON, DataTypes.YAML, DataTypes.YML,
        DataTypes.TOML, DataTypes.CSV,
        DataTypes.RSS, DataTypes.ATOM,
    }

    def __new__(cls) -> "DocumentToDocument":
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super(DocumentToDocument, cls).__new__(cls)
        return cls._instance

    def __init__(self) -> None:
        """Log start-up progress and make sure TinyTeX is available."""
        self.disp.log_debug("Initialising...")
        self.disp.log_debug("Checking for TinyTeX dependencies...")
        self._ensure_tinytex()
        self.disp.log_debug("TinyTeX dependencies satisfied.")
        self.disp.log_debug("Initialised.")

    def __call__(self, data: bytes, source_format: DataTypes) -> CONV_CONST.ConversionResult:
        """Shorthand for :meth:`document_to_document`."""
        return self.document_to_document(data, source_format)

    def _ensure_tinytex(self) -> None:
        """Ensure TinyTeX is installed for LaTeX/PDF operations.

        Installation failures are logged as warnings and never raised, so a
        missing TinyTeX does not prevent the converter from being created.
        """
        if self._downloaded_tinytex:
            return

        self.disp.log_debug("Installing TinyTeX...")
        try:
            path = TinyTeXInstaller("TinyTeX-1").install()
            self._downloaded_tinytex = True
            self.disp.log_debug(f"TinyTeX installed at: {path}")
        except Exception as e:
            # Best effort: keep the flag False so a later call can retry.
            self.disp.log_warning(f"TinyTeX installation failed: {e}")
            self._downloaded_tinytex = False

    def get_document_extension(self, data_type: DataTypes) -> Optional[str]:
        """
        Get the file extension for a given document DataType.

        Args:
            data_type: The DataType to get extension for

        Returns:
            Extension string, or None when the type has no known alias
        """
        return DOCUMENT_FORMAT_ALIASES.get(data_type)

    def _is_text_based(self, data_type: DataTypes) -> bool:
        """Check whether a format is listed in ``_TEXT_BASED_FORMATS``."""
        return data_type in self._TEXT_BASED_FORMATS

    def _can_use_memory_conversion(self, source: DataTypes, dest: DataTypes) -> bool:
        """Check whether both ends of the conversion are text-based formats."""
        return self._is_text_based(source) and self._is_text_based(dest)

    def _validate_conversion_params(
        self,
        data: bytes,
        source_format: DataTypes
    ) -> tuple[Optional[DataTypes], Optional[str], Optional[str]]:
        """
        Validate conversion parameters and get destination format and extensions.

        Args:
            data: The document data to validate (currently not inspected)
            source_format: The source document format

        Returns:
            Tuple of (destination_format, source_ext, dest_ext).
            - (None, None, None) when no conversion target exists.
            - (destination_format, None, None) when source and destination
              are the same format, or when either extension is unknown.
        """
        try:
            destination_format = MEDIA_TYPES.get_conversion_target(
                source_format)
        except (AttributeError, NameError) as e:
            # Treated as "no target", but leave a trace of why the lookup failed.
            self.disp.log_debug(
                f"Conversion target lookup failed for {source_format}: {e}"
            )
            destination_format = None

        if destination_format is None:
            self.disp.log_debug(f"No conversion target for {source_format}")
            return None, None, None

        if source_format == destination_format:
            self.disp.log_debug(
                f"Source and destination formats are the same: {source_format}"
            )
            return destination_format, None, None

        source_ext = self.get_document_extension(source_format)
        dest_ext = self.get_document_extension(destination_format)

        if not source_ext or not dest_ext:
            self.disp.log_warning(
                f"Unknown document extension for {source_format} -> {destination_format}"
            )
            return destination_format, None, None

        return destination_format, source_ext, dest_ext

    def _create_temp_files(
        self,
        data: bytes,
        source_ext: str,
        dest_ext: str
    ) -> tuple[Path, Path]:
        """
        Create temporary files for document conversion.
        WARNING: This incurs billable I/O operations. Only used when in-memory conversion is not possible.

        Both files are created with ``delete=False``; the caller owns them and
        must remove them. If creation fails part-way, whatever was already
        created is removed here before the error is re-raised.

        Args:
            data: The source document data
            source_ext: The source file extension (without the leading dot)
            dest_ext: The destination file extension (without the leading dot)

        Returns:
            Tuple of (source_path, destination_path)

        Raises:
            Exception: Whatever the file creation or the write raised.
        """
        self.disp.log_warning(
            "Using file-based conversion - this incurs I/O costs")

        src_path: Optional[Path] = None
        dst_path: Optional[Path] = None

        try:
            with tempfile.NamedTemporaryFile(
                suffix=f".{source_ext}",
                delete=False
            ) as src_file:
                # Record the path before writing so a failed write can be cleaned up.
                src_path = Path(src_file.name)
                src_file.write(data)

            # Only the name is needed: Pandoc writes the actual content later.
            with tempfile.NamedTemporaryFile(
                suffix=f".{dest_ext}",
                delete=False
            ) as dst_file:
                dst_path = Path(dst_file.name)
        except Exception:
            # The caller never received the paths, so nobody else can delete them.
            for leftover in (src_path, dst_path):
                if leftover is None:
                    continue
                try:
                    leftover.unlink(missing_ok=True)
                except OSError:
                    # Best effort: the original error is the one worth reporting.
                    pass
            raise

        return src_path, dst_path

    def _convert_in_memory(
        self,
        data: bytes,
        source_format: DataTypes,
        destination_format: DataTypes
    ) -> Optional[bytes]:
        """
        Convert a text-based document without creating temporary files.

        The payload is decoded as UTF-8 (undecodable bytes are replaced),
        passed to ``pypandoc.convert_text`` and re-encoded as UTF-8. The
        lower-cased enum member names are used as Pandoc format names.

        Args:
            data: Source document data
            source_format: Source document format
            destination_format: Destination document format

        Returns:
            Converted document data as bytes, or None if conversion failed
        """
        self.disp.log_debug(
            f"Converting in-memory (ZERO I/O): {source_format} -> {destination_format}"
        )

        try:
            text_data = data.decode('utf-8', errors='replace')

            converted_text = pypandoc.convert_text(
                text_data,
                to=destination_format.name.lower(),
                format=source_format.name.lower(),
                extra_args=['--standalone']
            )

            # Normalise the output to bytes whatever pypandoc handed back.
            if isinstance(converted_text, str):
                result = converted_text.encode('utf-8')
            else:
                result = converted_text

            self.disp.log_debug(
                "In-memory conversion successful (ZERO I/O cost)")
            return result

        except Exception as e:
            self.disp.log_error(f"In-memory conversion failed: {e}")
            return None

    def _perform_conversion(
        self,
        src_path: Path,
        dst_path: Path,
        source_format: DataTypes,
        destination_format: DataTypes
    ) -> Optional[bytes]:
        """
        Perform the actual document conversion using Pandoc with file I/O.
        WARNING: This incurs billable I/O operations.

        No input format is passed to Pandoc; ``source_format`` is only used
        in the log message.

        Args:
            src_path: Path to source document file
            dst_path: Path to destination document file
            source_format: Source document format
            destination_format: Destination document format

        Returns:
            Converted document data as bytes, or None if conversion failed
        """
        self.disp.log_debug(
            f"Converting document (file-based, incurs I/O): {source_format} -> {destination_format}"
        )

        try:
            pypandoc.convert_file(
                str(src_path),
                to=destination_format.name.lower(),
                outputfile=str(dst_path),
                extra_args=['--standalone']
            )

            converted_data = dst_path.read_bytes()

            self.disp.log_debug("Document conversion successful")
            return converted_data

        except Exception as e:
            self.disp.log_error(f"Pandoc conversion failed: {e}")
            return None

    @staticmethod
    def _cleanup_temp_files(src_path: Path, dst_path: Path) -> None:
        """
        Clean up temporary files.

        Both deletions are always attempted, and a file that is already gone
        is not an error.

        Args:
            src_path: Source file path to delete
            dst_path: Destination file path to delete

        Raises:
            OSError: The first deletion error, raised after both attempts.
        """
        first_error: Optional[OSError] = None

        for path in (src_path, dst_path):
            try:
                # missing_ok avoids the exists()/unlink() race.
                path.unlink(missing_ok=True)
            except OSError as e:
                if first_error is None:
                    first_error = e

        if first_error is not None:
            raise first_error

    def _create_failed_result(
        self,
        data: bytes,
        source_format: DataTypes,
        destination_format: DataTypes,
        result: Optional[bytes] = None
    ) -> CONV_CONST.ConversionResult:
        """
        Create a failed conversion result.

        Args:
            data: Original document data
            source_format: Source document format
            destination_format: Destination document format
            result: Optional result data

        Returns:
            ConversionResult with ``converted`` set to False
        """
        return CONV_CONST.ConversionResult(
            data=data,
            converted=False,
            from_type=source_format,
            to_type=destination_format,
            result=result
        )

    def _create_success_result(
        self,
        data: bytes,
        source_format: DataTypes,
        destination_format: DataTypes,
        converted_data: bytes
    ) -> CONV_CONST.ConversionResult:
        """
        Create a successful conversion result.

        Args:
            data: Original document data
            source_format: Source document format
            destination_format: Destination document format
            converted_data: Converted document data

        Returns:
            ConversionResult with ``converted`` set to True
        """
        return CONV_CONST.ConversionResult(
            data=data,
            converted=True,
            from_type=source_format,
            to_type=destination_format,
            result=converted_data
        )

    def _handle_validation_failure(
        self,
        data: bytes,
        source_format: DataTypes,
        destination_format: Optional[DataTypes],
        source_ext: Optional[str],
        dest_ext: Optional[str]
    ) -> Optional[CONV_CONST.ConversionResult]:
        """
        Handle validation failures and return appropriate result.

        Args:
            data: Original document data
            source_format: Source document format
            destination_format: Destination format (may be None)
            source_ext: Source extension (may be None)
            dest_ext: Destination extension (may be None)

        Returns:
            ConversionResult if validation failed, None if validation passed
        """
        if destination_format is None:
            # No target at all: report the source format on both sides.
            return self._create_failed_result(
                data, source_format, source_format, None
            )

        if source_ext is None or dest_ext is None:
            # Same format or unknown extension: hand the original bytes back.
            # NOTE(review): the previous conditional returned `data` in both
            # branches; confirm whether the unknown-extension case should
            # carry None instead.
            return self._create_failed_result(
                data, source_format, destination_format, data
            )

        return None

    def _convert_with_temp_files(
        self,
        data: bytes,
        source_format: DataTypes,
        destination_format: DataTypes,
        source_ext: str,
        dest_ext: str
    ) -> CONV_CONST.ConversionResult:
        """
        Perform conversion, preferring the in-memory path when possible.

        File-based conversion (billable I/O) is used when either format is
        not text-based, or when the in-memory attempt fails.

        Args:
            data: Original document data
            source_format: Source document format
            destination_format: Destination document format
            source_ext: Source file extension
            dest_ext: Destination file extension

        Returns:
            ConversionResult with conversion outcome
        """
        if self._can_use_memory_conversion(source_format, destination_format):
            try:
                converted_data = self._convert_in_memory(
                    data, source_format, destination_format
                )

                if converted_data is not None:
                    return self._create_success_result(
                        data, source_format, destination_format, converted_data
                    )

                self.disp.log_warning(
                    "In-memory conversion failed, falling back to file-based (will incur I/O cost)"
                )
            except Exception as e:
                self.disp.log_warning(
                    f"In-memory conversion error: {e}, falling back to file-based (will incur I/O cost)"
                )
        else:
            self.disp.log_info(
                f"Binary format conversion {source_format} -> {destination_format} requires file I/O (billable)"
            )

        # Fall back to file-based conversion (incurs billable I/O)
        src_path: Optional[Path] = None
        dst_path: Optional[Path] = None

        try:
            src_path, dst_path = self._create_temp_files(
                data, source_ext, dest_ext
            )

            converted_data = self._perform_conversion(
                src_path, dst_path, source_format, destination_format
            )

            if converted_data is None:
                return self._create_failed_result(
                    data, source_format, destination_format, None
                )

            return self._create_success_result(
                data, source_format, destination_format, converted_data
            )

        except Exception as e:
            self.disp.log_error(f"Document conversion error: {e}")
            return self._create_failed_result(
                data, source_format, destination_format, None
            )

        finally:
            if src_path is not None and dst_path is not None:
                try:
                    self._cleanup_temp_files(src_path, dst_path)
                except OSError as e:
                    # Raising here would discard the result being returned.
                    self.disp.log_warning(
                        f"Failed to remove temporary files: {e}"
                    )

    def document_to_document(self, data: bytes, source_format: DataTypes) -> CONV_CONST.ConversionResult:
        """
        Convert document data from one format to another using Pandoc.

        The destination format is looked up from ``source_format`` via
        ``MEDIA_TYPES.get_conversion_target``; callers do not choose it.

        Args:
            data (bytes): The document data to convert.
            source_format (DataTypes): The original document format.

        Returns:
            ConversionResult: The converted document data in a contained dataclass.
        """
        destination_format, source_ext, dest_ext = self._validate_conversion_params(
            data, source_format
        )

        validation_result = self._handle_validation_failure(
            data, source_format, destination_format, source_ext, dest_ext
        )
        if validation_result is not None:
            return validation_result

        # Type guard: validation guarantees these are set, but the type
        # checker cannot see that.
        if destination_format is None or source_ext is None or dest_ext is None:
            return self._create_failed_result(
                data, source_format, source_format, None
            )
        return self._convert_with_temp_files(
            data, source_format, destination_format, source_ext, dest_ext
        )
CONV_CONST.ConversionResult _create_failed_result(self, bytes data, DataTypes source_format, DataTypes destination_format, Optional[bytes] result=None)
tuple[Path, Path] _create_temp_files(self, bytes data, str source_ext, str dest_ext)
Optional[CONV_CONST.ConversionResult] _handle_validation_failure(self, bytes data, DataTypes source_format, Optional[DataTypes] destination_format, Optional[str] source_ext, Optional[str] dest_ext)
CONV_CONST.ConversionResult __call__(self, bytes data, DataTypes source_format)
CONV_CONST.ConversionResult _convert_with_temp_files(self, bytes data, DataTypes source_format, DataTypes destination_format, str source_ext, str dest_ext)
CONV_CONST.ConversionResult document_to_document(self, bytes data, DataTypes source_format)
Optional[bytes] _perform_conversion(self, Path src_path, Path dst_path, DataTypes source_format, DataTypes destination_format)
CONV_CONST.ConversionResult _create_success_result(self, bytes data, DataTypes source_format, DataTypes destination_format, bytes converted_data)
tuple[Optional[DataTypes], Optional[str], Optional[str]] _validate_conversion_params(self, bytes data, DataTypes source_format)
Optional[bytes] _convert_in_memory(self, bytes data, DataTypes source_format, DataTypes destination_format)