2# +==== BEGIN CatFeeder =================+
5# ...............)..(.')
7# ...............\(__)|
8# Inspired by Joan Stark
9# source https://www.asciiart.eu/
13# FILE: document_to_document.py
14# CREATION DATE: 15-01-2026
15# LAST Modified: 1:32:47 17-01-2026
17# This is the backend server in charge of making the actual website work.
19# COPYRIGHT: (c) Cat Feeder
20# PURPOSE: The file containing the code for converting documents from one format to another.
22# +==== END CatFeeder =================+
26from typing
import Optional, Set
27from pathlib
import Path
30from display_tty
import Disp, initialise_logger
35from .aliases
import DOCUMENT_FORMAT_ALIASES
36from .
import converters_constants
as CONV_CONST
38from ..http_constants
import DataTypes, MEDIA_TYPES
40from ...core
import FinalClass
41from ...utils
import CONST
42from ...tinytex
import TinyTeXInstaller
46 """Class used to convert documents from one format to another using Pandoc.
48 Optimized to minimize I/O costs:
49 - Text-based formats (MD, HTML, JSON, etc.) converted entirely in-memory (zero I/O)
50 - Binary formats (PDF, DOCX, etc.) require file I/O (billable operations)
53 disp: Disp = initialise_logger(__qualname__, CONST.DEBUG)
55 _instance: Optional[
"DocumentToDocument"] =
None
57 _downloaded_tinytex: bool =
False
60 _TEXT_BASED_FORMATS: Set[DataTypes] = {
61 DataTypes.TXT, DataTypes.TEXT, DataTypes.PLAIN,
62 DataTypes.MARKDOWN, DataTypes.MD,
63 DataTypes.HTML, DataTypes.XML, DataTypes.XHTML,
64 DataTypes.JSON, DataTypes.YAML, DataTypes.YML,
65 DataTypes.TOML, DataTypes.CSV,
66 DataTypes.RSS, DataTypes.ATOM,
75 self.
disp.log_debug(
"Initialising...")
76 self.
disp.log_debug(
"Checking for TinyTeX dependencies...")
78 self.
disp.log_debug(
"TinyTeX dependencies satisfied.")
79 self.
disp.log_debug(
"Initialised.")
81 def __call__(self, data: bytes, source_format: DataTypes) -> CONV_CONST.ConversionResult:
85 """Ensure TinyTeX is installed for LaTeX/PDF operations."""
87 self.
disp.log_debug(
"Installing TinyTeX...")
91 self.
disp.log_debug(f
"TinyTeX installed at: {path}")
92 except Exception
as e:
93 self.
disp.log_warning(f
"TinyTeX installation failed: {e}")
def get_document_extension(self, data_type: DataTypes) -> Optional[str]:
    """Look up the file extension registered for a document DataType.

    NOTE(review): the ``def`` header is restored from the signature index
    at the end of the file; confirm no decorator was present.

    Args:
        data_type: The DataType whose extension is wanted.

    Returns:
        The extension string, or None when ``DOCUMENT_FORMAT_ALIASES``
        holds no entry for ``data_type``.
    """
    extension = DOCUMENT_FORMAT_ALIASES.get(data_type)
    return extension
109 """Check if a format is text-based and can use in-memory conversion (zero I/O)."""
113 """Check if conversion can be done entirely in memory (zero I/O cost)."""
119 source_format: DataTypes
120 ) -> tuple[Optional[DataTypes], Optional[str], Optional[str]]:
122 Validate conversion parameters and get destination format and extensions.
125 data: The document data to validate
126 source_format: The source document format
129 Tuple of (destination_format, source_ext, dest_ext)
130 Returns (None, None, None) if validation fails
133 destination_format = MEDIA_TYPES.get_conversion_target(
135 except (AttributeError, NameError):
136 destination_format =
None
138 if destination_format
is None:
139 self.
disp.log_debug(f
"No conversion target for {source_format}")
140 return None,
None,
None
142 if source_format == destination_format:
144 f
"Source and destination formats are the same: {source_format}"
146 return destination_format,
None,
None
151 if not source_ext
or not dest_ext:
152 self.
disp.log_warning(
153 f
"Unknown document extension for {source_format} -> {destination_format}"
155 return destination_format,
None,
None
157 return destination_format, source_ext, dest_ext
164 ) -> tuple[Path, Path]:
166 Create temporary files for document conversion.
167 WARNING: This incurs billable I/O operations. Only used when in-memory conversion is not possible.
170 data: The source document data
171 source_ext: The source file extension
172 dest_ext: The destination file extension
175 Tuple of (source_path, destination_path)
177 self.
disp.log_warning(
178 "Using file-based conversion - this incurs I/O costs")
180 with tempfile.NamedTemporaryFile(
181 suffix=f
".{source_ext}",
185 src_path = Path(src_file.name)
187 with tempfile.NamedTemporaryFile(
188 suffix=f
".{dest_ext}",
191 dst_path = Path(dst_file.name)
193 return src_path, dst_path
198 source_format: DataTypes,
199 destination_format: DataTypes
200 ) -> Optional[bytes]:
202 Perform in-memory document conversion for text-based formats.
203 ZERO I/O COST - everything happens in RAM, no disk writes.
206 data: Source document data
207 source_format: Source document format
208 destination_format: Destination document format
211 Converted document data as bytes, or None if conversion failed
214 f
"Converting in-memory (ZERO I/O): {source_format} -> {destination_format}"
219 text_data = data.decode(
'utf-8', errors=
'replace')
222 converted_text = pypandoc.convert_text(
224 to=destination_format.name.lower(),
225 format=source_format.name.lower(),
226 extra_args=[
'--standalone']
230 if isinstance(converted_text, str):
231 result = converted_text.encode(
'utf-8')
233 result = converted_text
236 "In-memory conversion successful (ZERO I/O cost)")
239 except Exception
as e:
240 self.
disp.log_error(f
"In-memory conversion failed: {e}")
247 source_format: DataTypes,
248 destination_format: DataTypes
249 ) -> Optional[bytes]:
251 Perform the actual document conversion using Pandoc with file I/O.
252 WARNING: This incurs billable I/O operations.
255 src_path: Path to source document file
256 dst_path: Path to destination document file
257 source_format: Source document format
258 destination_format: Destination document format
261 Converted document data as bytes, or None if conversion failed
264 f
"Converting document (file-based, incurs I/O): {source_format} -> {destination_format}"
269 pypandoc.convert_file(
271 to=destination_format.name.lower(),
272 outputfile=str(dst_path),
273 extra_args=[
'--standalone']
276 with open(dst_path,
'rb')
as f:
277 converted_data = f.read()
279 self.
disp.log_debug(
"Document conversion successful")
280 return converted_data
282 except Exception
as e:
283 self.
disp.log_error(f
"Pandoc conversion failed: {e}")
289 Clean up temporary files.
292 src_path: Source file path to delete
293 dst_path: Destination file path to delete
295 if src_path.exists():
297 if dst_path.exists():
303 source_format: DataTypes,
304 destination_format: DataTypes,
305 result: Optional[bytes] =
None
306 ) -> CONV_CONST.ConversionResult:
308 Create a failed conversion result.
311 data: Original document data
312 source_format: Source document format
313 destination_format: Destination document format
314 result: Optional result data
317 ConversionResult indicating failure
319 return CONV_CONST.ConversionResult(
322 from_type=source_format,
323 to_type=destination_format,
330 source_format: DataTypes,
331 destination_format: DataTypes,
332 converted_data: bytes
333 ) -> CONV_CONST.ConversionResult:
335 Create a successful conversion result.
338 data: Original document data
339 source_format: Source document format
340 destination_format: Destination document format
341 converted_data: Converted document data
344 ConversionResult indicating success
346 return CONV_CONST.ConversionResult(
349 from_type=source_format,
350 to_type=destination_format,
351 result=converted_data
357 source_format: DataTypes,
358 destination_format: Optional[DataTypes],
359 source_ext: Optional[str],
360 dest_ext: Optional[str]
361 ) -> Optional[CONV_CONST.ConversionResult]:
363 Handle validation failures and return appropriate result.
366 data: Original document data
367 source_format: Source document format
368 destination_format: Destination format (may be None)
369 source_ext: Source extension (may be None)
370 dest_ext: Destination extension (may be None)
373 ConversionResult if validation failed, None if validation passed
375 if destination_format
is None:
377 data, source_format, source_format,
None
380 if source_ext
is None or dest_ext
is None:
381 result_data = data
if source_format == destination_format
else data
383 data, source_format, destination_format, result_data
391 source_format: DataTypes,
392 destination_format: DataTypes,
395 ) -> CONV_CONST.ConversionResult:
397 Perform conversion, preferring in-memory (ZERO I/O cost) when possible.
398 Falls back to file-based conversion (billable I/O) only for binary formats.
401 data: Original document data
402 source_format: Source document format
403 destination_format: Destination document format
404 source_ext: Source file extension
405 dest_ext: Destination file extension
408 ConversionResult with conversion outcome
414 data, source_format, destination_format
417 if converted_data
is not None:
419 data, source_format, destination_format, converted_data
422 self.
disp.log_warning(
423 "In-memory conversion failed, falling back to file-based (will incur I/O cost)"
425 except Exception
as e:
426 self.
disp.log_warning(
427 f
"In-memory conversion error: {e}, falling back to file-based (will incur I/O cost)"
431 f
"Binary format conversion {source_format} -> {destination_format} requires file I/O (billable)"
435 src_path: Optional[Path] =
None
436 dst_path: Optional[Path] =
None
440 data, source_ext, dest_ext
444 src_path, dst_path, source_format, destination_format
447 if converted_data
is None:
449 data, source_format, destination_format,
None
453 data, source_format, destination_format, converted_data
456 except Exception
as e:
457 self.
disp.log_error(f
"Document conversion error: {e}")
459 data, source_format, destination_format,
None
463 if src_path
is not None and dst_path
is not None:
def document_to_document(self, data: bytes, source_format: DataTypes) -> CONV_CONST.ConversionResult:
    """Convert document data from one format to another using Pandoc.

    Runs in three stages: parameter validation, early exit when
    validation produced a result, then the conversion itself.

    NOTE(review): the callee names below are restored from the visible
    argument lists and the method signature index; confirm against the
    pristine file.

    Args:
        data (bytes): The document data to convert.
        source_format (DataTypes): The original document format.

    Returns:
        ConversionResult: The converted document data in a contained dataclass.
    """
    validated = self._validate_conversion_params(data, source_format)
    destination_format, source_ext, dest_ext = validated

    # A non-None value here is a ready-made result; hand it straight back.
    early_result = self._handle_validation_failure(
        data, source_format, destination_format, source_ext, dest_ext
    )
    if early_result is not None:
        return early_result

    # Guard kept from the original: narrows the Optional values before
    # they are passed on to the conversion step.
    if destination_format is None or source_ext is None or dest_ext is None:
        return self._create_failed_result(
            data, source_format, source_format, None
        )

    return self._convert_with_temp_files(
        data, source_format, destination_format, source_ext, dest_ext
    )
CONV_CONST.ConversionResult _create_failed_result(self, bytes data, DataTypes source_format, DataTypes destination_format, Optional[bytes] result=None)
None _cleanup_temp_files(Path src_path, Path dst_path)
tuple[Path, Path] _create_temp_files(self, bytes data, str source_ext, str dest_ext)
None _ensure_tinytex(self)
Optional[str] get_document_extension(self, DataTypes data_type)
"DocumentToDocument" __new__(cls)
Optional[CONV_CONST.ConversionResult] _handle_validation_failure(self, bytes data, DataTypes source_format, Optional[DataTypes] destination_format, Optional[str] source_ext, Optional[str] dest_ext)
CONV_CONST.ConversionResult __call__(self, bytes data, DataTypes source_format)
bool _is_text_based(self, DataTypes data_type)
bool _can_use_memory_conversion(self, DataTypes source, DataTypes dest)
CONV_CONST.ConversionResult _convert_with_temp_files(self, bytes data, DataTypes source_format, DataTypes destination_format, str source_ext, str dest_ext)
CONV_CONST.ConversionResult document_to_document(self, bytes data, DataTypes source_format)
Optional[bytes] _perform_conversion(self, Path src_path, Path dst_path, DataTypes source_format, DataTypes destination_format)
CONV_CONST.ConversionResult _create_success_result(self, bytes data, DataTypes source_format, DataTypes destination_format, bytes converted_data)
tuple[Optional[DataTypes], Optional[str], Optional[str]] _validate_conversion_params(self, bytes data, DataTypes source_format)
Optional[bytes] _convert_in_memory(self, bytes data, DataTypes source_format, DataTypes destination_format)