"""
Caching utilities for the OBS Recording Transcriber.

Provides functions to cache and retrieve transcription and summarization
results. Each cache entry is a JSON file stored in ``CACHE_DIR`` whose name is
derived from the source file's path, size and modification time, plus the
optional model and operation names.
"""

import hashlib
import json
import logging
import os
import re
import time
from pathlib import Path
from typing import Any, Optional, Tuple, Union

# Configure logging
# NOTE: this configures the root logger at import time; it is kept so that
# importing this module behaves exactly as before.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Default cache directory
CACHE_DIR = Path.home() / ".obs_transcriber_cache"

# Characters that cannot safely appear in a file name on common platforms
# (path separators, Windows-reserved punctuation, control characters).
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')

_StrPath = Union[str, Path]


def _safe_component(value: Any) -> str:
    """Return ``value`` as text with filename-unsafe characters replaced by '_'."""
    return _UNSAFE_FILENAME_CHARS.sub("_", str(value))


def _read_timestamp(cache_file: Path) -> Optional[float]:
    """Return the "timestamp" stored in a cache file, or None if it is unusable.

    A missing "timestamp" key yields 0. None is returned when the file cannot
    be read, is not valid JSON, is not a JSON object, or holds a non-numeric
    timestamp.
    """
    try:
        with open(cache_file, "r", encoding="utf-8") as f:
            entry = json.load(f)
    except (OSError, ValueError):
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        return None
    if not isinstance(entry, dict):
        return None
    stamp = entry.get("timestamp", 0)
    return stamp if isinstance(stamp, (int, float)) else None


def get_file_hash(file_path: _StrPath) -> Optional[str]:
    """
    Generate a hash for a file based on its path, size and modification time.

    The file content is not read; hashing the metadata is much faster than
    hashing the whole file.

    Args:
        file_path (Path): Path to the file

    Returns:
        str: Hex digest identifying the file, or None if the file cannot be
            stat'ed (for example because it does not exist)
    """
    file_path = Path(file_path)
    try:
        # A single stat() call avoids the gap between an exists() check and
        # the later stat() in which the file could disappear.
        stats = file_path.stat()
    except (OSError, ValueError):
        return None

    hash_input = f"{file_path.absolute()}|{stats.st_size}|{stats.st_mtime}"
    return hashlib.md5(hash_input.encode()).hexdigest()


def get_cache_path(
    file_path: _StrPath,
    model: Optional[str] = None,
    operation: Optional[str] = None,
) -> Optional[Path]:
    """
    Get the cache file path for a given input file and operation.

    The cache directory is created if it does not exist yet.

    Args:
        file_path (Path): Path to the original file
        model (str, optional): Model used for processing
        operation (str, optional): Operation type (e.g., 'transcribe', 'summarize')

    Returns:
        Path: Path to the cache file, or None if the input file cannot be hashed
    """
    file_hash = get_file_hash(file_path)
    if not file_hash:
        return None

    # Create cache directory if it doesn't exist
    CACHE_DIR.mkdir(parents=True, exist_ok=True)

    # Build the file name from the hash and the optional parameters. The
    # parameters are sanitized so that a value such as "openai/whisper" cannot
    # point into a (nonexistent) subdirectory.
    parts = [file_hash]
    parts.extend(_safe_component(part) for part in (model, operation) if part)
    return CACHE_DIR / ("_".join(parts) + ".json")


def save_to_cache(
    file_path: _StrPath,
    data: Any,
    model: Optional[str] = None,
    operation: Optional[str] = None,
) -> bool:
    """
    Save data to cache.

    The entry is written to a temporary file first and then moved into place,
    so a failed write never leaves a truncated cache file behind.

    Args:
        file_path (Path): Path to the original file
        data (dict): Data to cache (must be JSON-serializable)
        model (str, optional): Model used for processing
        operation (str, optional): Operation type

    Returns:
        bool: True if successful, False otherwise
    """
    tmp_path = None
    try:
        cache_path = get_cache_path(file_path, model, operation)
        if cache_path is None:
            return False

        # Add metadata to the cached data
        cache_data = {
            "original_file": str(Path(file_path).absolute()),
            "timestamp": time.time(),
            "model": model,
            "operation": operation,
            "data": data,
        }

        # The temporary name does not end in ".json", so the directory scans
        # in clear_cache() and get_cache_size() never see a partial file.
        tmp_path = cache_path.with_name(f"{cache_path.name}.{os.getpid()}.tmp")
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cache_data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, cache_path)
        tmp_path = None  # moved into place; nothing left to clean up

        logger.info("Cached data saved to %s", cache_path)
        return True
    except Exception as e:
        # Any failure (I/O error, non-serializable data, ...) is reported as
        # False rather than raised.
        logger.error("Error saving cache: %s", e)
        return False
    finally:
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup: the temporary file may never have been
                # created in the first place.
                pass


def load_from_cache(
    file_path: _StrPath,
    model: Optional[str] = None,
    operation: Optional[str] = None,
    max_age: Optional[float] = None,
) -> Any:
    """
    Load data from cache if available and not expired.

    Args:
        file_path (Path): Path to the original file
        model (str, optional): Model used for processing
        operation (str, optional): Operation type
        max_age (float, optional): Maximum age of cache in seconds

    Returns:
        dict or None: Cached data or None if not available
    """
    try:
        cache_path = get_cache_path(file_path, model, operation)
        if cache_path is None:
            return None

        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                cache_data = json.load(f)
        except FileNotFoundError:
            # Plain cache miss; not an error.
            return None

        # Check if cache is expired
        if max_age is not None:
            cache_time = cache_data.get("timestamp", 0)
            if time.time() - cache_time > max_age:
                logger.info("Cache expired for %s", file_path)
                return None

        logger.info("Loaded data from cache: %s", cache_path)
        return cache_data.get("data")
    except Exception as e:
        # A corrupt or unreadable entry is treated as "no cache available".
        logger.error("Error loading cache: %s", e)
        return None


def clear_cache(max_age: Optional[float] = None) -> int:
    """
    Clear all cache files or only expired ones.

    When ``max_age`` is given, entries that cannot be read or parsed are
    removed as well, since they can never be served from the cache.

    Args:
        max_age (float, optional): Maximum age of cache in seconds

    Returns:
        int: Number of files deleted
    """
    if not CACHE_DIR.exists():
        return 0

    count = 0
    for cache_file in CACHE_DIR.glob("*.json"):
        try:
            if max_age is not None:
                stamp = _read_timestamp(cache_file)
                if stamp is None:
                    logger.warning(
                        "Removing unreadable cache file %s", cache_file
                    )
                elif time.time() - stamp <= max_age:
                    continue  # Skip non-expired files

            # Delete the file
            os.remove(cache_file)
            count += 1
        except Exception as e:
            logger.error("Error deleting cache file %s: %s", cache_file, e)

    logger.info("Cleared %s cache files", count)
    return count


def get_cache_size() -> Tuple[int, int]:
    """
    Get the total size of the cache directory.

    Returns:
        tuple: (size_bytes, file_count)
    """
    if not CACHE_DIR.exists():
        return 0, 0

    total_size = 0
    file_count = 0
    for cache_file in CACHE_DIR.glob("*.json"):
        try:
            total_size += cache_file.stat().st_size
        except OSError:
            # The file vanished or became unreadable during the scan; skip it.
            continue
        file_count += 1

    return total_size, file_count