# TalkEdit/utils/ollama_integration.py

"""
Ollama integration for local AI model inference.
Provides functions to use Ollama's API for text summarization with streaming support.
"""

import requests
import json
import logging
from pathlib import Path
import os

# Configure the root logger at INFO level as a side effect of importing this module.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by every function below.
logger = logging.getLogger(__name__)

# Base URL of the Ollama HTTP API. The OLLAMA_API_URL environment variable
# overrides the default local endpoint; endpoint paths such as "/tags" and
# "/generate" are appended to this value with a leading slash.
OLLAMA_API_URL = os.environ.get("OLLAMA_API_URL", "http://localhost:11434/api")


def check_ollama_available():
    """Report whether the Ollama HTTP API answers on its ``/tags`` endpoint.

    Returns:
        bool: True when the endpoint replies with HTTP 200 within two
        seconds; False for any other status or when the request fails.
    """
    tags_url = f"{OLLAMA_API_URL}/tags"
    try:
        reply = requests.get(tags_url, timeout=2)
    except requests.exceptions.RequestException:
        # Refused connection, timeout, DNS failure: all mean "not available".
        return False
    return reply.status_code == 200


def list_available_models():
    """List the names of the models known to the Ollama instance.

    Returns:
        list[str]: The ``name`` of every entry under ``models`` in the
        ``/tags`` response. An empty list is returned when the service is
        unreachable, does not answer in time, replies with a non-200 status,
        or sends a body that cannot be interpreted.
    """
    try:
        # Bound the wait so an unresponsive server cannot block the caller forever.
        response = requests.get(f"{OLLAMA_API_URL}/tags", timeout=5)
        if response.status_code != 200:
            return []
        # "models" may be missing or null; treat both as "no models".
        models = response.json().get('models') or []
        # Skip entries without a name instead of failing the whole listing.
        return [model['name'] for model in models if 'name' in model]
    except requests.exceptions.RequestException as e:
        logger.error(f"Error listing Ollama models: {e}")
        return []
    except (ValueError, AttributeError, TypeError) as e:
        # ValueError: body is not JSON; AttributeError/TypeError: JSON has an
        # unexpected shape (e.g. top level is not an object).
        logger.error(f"Unexpected response while listing Ollama models: {e}")
        return []


def summarize_with_ollama(text, model="llama3", max_length=150):
    """Summarize text using Ollama's local API (non-streaming).

    Args:
        text: The text to summarize; it is embedded verbatim in the prompt.
        model: Name of the Ollama model to run.
        max_length: Target summary length in words. It is stated in the prompt
            and, when positive, also caps generation at ``max_length * 2``
            tokens.

    Returns:
        str | None: The stripped summary text, or None when the service is
        unavailable, the request fails, the API replies with a non-200
        status, or the reply body cannot be interpreted.
    """
    if not check_ollama_available():
        logger.warning("Ollama service is not available")
        return None

    prompt = f"Summarize the following text in about {max_length} words:\n\n{text}"

    options = {"temperature": 0.3, "top_p": 0.9}
    if max_length > 0:
        # Ollama's generation cap is "num_predict"; "max_tokens" is not an
        # Ollama option, so the previous limit was never applied. A
        # non-positive length leaves the server default in place rather than
        # requesting zero tokens.
        options["num_predict"] = max_length * 2

    try:
        response = requests.post(
            f"{OLLAMA_API_URL}/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": options,
            },
        )
        if response.status_code != 200:
            logger.error(f"Ollama API error: {response.status_code} - {response.text}")
            return None
        result = response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error communicating with Ollama: {e}")
        return None
    except ValueError as e:
        # Older versions of requests raise a plain ValueError for a non-JSON body.
        logger.error(f"Ollama returned a response that is not valid JSON: {e}")
        return None

    if not isinstance(result, dict):
        logger.error("Ollama returned an unexpected response payload")
        return None

    # "response" may be absent or null; both yield an empty summary.
    return (result.get('response') or '').strip()


def stream_summarize_with_ollama(text, model="llama3", max_length=150):
    """
    Summarize text using Ollama with streaming. Yields tokens as they arrive.

    Args:
        text: The text to summarize; it is embedded verbatim in the prompt.
        model: Name of the Ollama model to run.
        max_length: Target summary length in words. It is stated in the prompt
            and, when positive, also caps generation at ``max_length * 2``
            tokens.

    Yields:
        str: Individual response tokens

    Nothing is yielded when the service is unavailable or the API replies
    with a non-200 status. Request failures and errors reported inside the
    stream are logged and end the stream early; no exception is propagated
    for them.
    """
    if not check_ollama_available():
        logger.warning("Ollama service is not available")
        return

    prompt = f"Summarize the following text in about {max_length} words:\n\n{text}"

    options = {"temperature": 0.3, "top_p": 0.9}
    if max_length > 0:
        # Ollama's generation cap is "num_predict"; "max_tokens" is not an
        # Ollama option, so the previous limit was never applied.
        options["num_predict"] = max_length * 2

    try:
        # The context manager releases the connection even when the consumer
        # stops iterating early or the generator is closed mid-stream.
        with requests.post(
            f"{OLLAMA_API_URL}/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": True,
                "options": options,
            },
            stream=True,
        ) as response:
            if response.status_code != 200:
                logger.error(f"Ollama API error: {response.status_code}")
                return

            for line in response.iter_lines():
                if not line:
                    # Skip empty keep-alive lines.
                    continue
                try:
                    data = json.loads(line)
                except ValueError as e:
                    # One malformed line should not abort the whole stream.
                    logger.error(f"Skipping malformed line from Ollama stream: {e}")
                    continue
                if not isinstance(data, dict):
                    continue

                if data.get('error'):
                    # Ollama reports mid-stream failures as an "error" object.
                    logger.error(f"Ollama API error: {data['error']}")
                    return

                token = data.get('response', '')
                if token:
                    yield token
                if data.get('done', False):
                    break
    except requests.exceptions.RequestException as e:
        logger.error(f"Error communicating with Ollama: {e}")

# Lower bound, in words, for the summary length requested per chunk, so that
# splitting into many chunks never asks the model for 0- or 1-word summaries.
_MIN_CHUNK_SUMMARY_WORDS = 25


def _split_into_chunks(text, chunk_size):
    """Split text on whitespace into chunks of roughly chunk_size characters.

    Words are never broken. Each word is counted as its length plus one
    separator, and a chunk is closed before that count would exceed
    chunk_size. A single word longer than chunk_size becomes a chunk of its
    own. Empty chunks are never produced.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        if current_length + len(word) + 1 <= chunk_size:
            current_chunk.append(word)
            current_length += len(word) + 1
        else:
            # Guard against emitting an empty chunk when the very first word
            # (or a word right after a flush) is itself longer than chunk_size.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word) + 1

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks


def _summarize_chunks(chunks, model, max_length):
    """Summarize each chunk (non-streaming) and return the non-empty summaries."""
    if not chunks:
        return []

    # Share the word budget across chunks, but never drop below a usable
    # minimum (itself capped at max_length for very small budgets).
    per_chunk_length = max(
        min(max_length, _MIN_CHUNK_SUMMARY_WORDS),
        max_length // len(chunks),
    )

    chunk_summaries = []
    for i, chunk in enumerate(chunks):
        logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
        summary = summarize_with_ollama(chunk, model, per_chunk_length)
        if summary:
            chunk_summaries.append(summary)
    return chunk_summaries


def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
    """Chunk long text and summarize each chunk, then combine.

    Args:
        text: The text to summarize.
        model: Name of the Ollama model to run.
        chunk_size: Texts no longer than this many characters are summarized
            in a single request; longer texts are split on whitespace.
        max_length: Target length of the final summary in words.

    Returns:
        str | None: The summary, or None when the text is empty or blank, or
        when no chunk produced a summary.
    """
    if not text or not text.strip():
        # Nothing to summarize; do not send a blank prompt to the model.
        return None

    if len(text) <= chunk_size:
        return summarize_with_ollama(text, model, max_length)

    chunk_summaries = _summarize_chunks(
        _split_into_chunks(text, chunk_size), model, max_length
    )

    if not chunk_summaries:
        return None

    if len(chunk_summaries) == 1:
        return chunk_summaries[0]

    # Second pass: condense the per-chunk summaries into one summary.
    combined_summary = " ".join(chunk_summaries)
    return summarize_with_ollama(combined_summary, model, max_length)


def stream_chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):
    """
    Chunk and summarize with streaming on the final summary.
    Chunk summaries are produced with non-streaming requests; only the final
    combination is streamed.

    Args:
        text: The text to summarize.
        model: Name of the Ollama model to run.
        chunk_size: Texts no longer than this many characters are streamed
            from a single request; longer texts are split on whitespace.
        max_length: Target length of the final summary in words.

    Yields:
        str: Tokens from the final summary. When only one chunk summary is
        available it is yielded whole as a single item. Nothing is yielded
        for empty or blank text, or when no chunk produced a summary.
    """
    if not text or not text.strip():
        # Nothing to summarize; do not send a blank prompt to the model.
        return

    if len(text) <= chunk_size:
        yield from stream_summarize_with_ollama(text, model, max_length)
        return

    chunk_summaries = _summarize_chunks(
        _split_into_chunks(text, chunk_size), model, max_length
    )

    if not chunk_summaries:
        return

    if len(chunk_summaries) == 1:
        yield chunk_summaries[0]
        return

    # Second pass: stream the condensation of the per-chunk summaries.
    combined_summary = " ".join(chunk_summaries)
    yield from stream_summarize_with_ollama(combined_summary, model, max_length)
Add installation scripts and update documentation for Phase 3 features 2025-03-01 20:37:52 -05:00			`"""`
			`Ollama integration for local AI model inference.`
feat: Add streaming Ollama support, model caching, and UI improvements - Add streaming summarization via Ollama API (stream_summarize_with_ollama) - Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper) - Add temp file cleanup for extracted audio - Add system capabilities detection (FFmpeg, GPU info) - Add get_video_duration utility - Improve validation with FFmpeg check - Rewrite app.py with streaming support and UI enhancements - Clean up redundant comments and unused imports across all utils 2026-02-18 10:26:09 -05:00			`Provides functions to use Ollama's API for text summarization with streaming support.`
Add installation scripts and update documentation for Phase 3 features 2025-03-01 20:37:52 -05:00			`"""`

			`import requests`
			`import json`
			`import logging`
			`from pathlib import Path`
			`import os`

			`logging.basicConfig(level=logging.INFO)`
			`logger = logging.getLogger(__name__)`

Enhance README.md with Docker installation instructions and update Ollama API endpoint to be configurable via environment variable. 2025-07-17 00:05:23 -04:00			`OLLAMA_API_URL = os.environ.get("OLLAMA_API_URL", "http://localhost:11434/api")`
Add installation scripts and update documentation for Phase 3 features 2025-03-01 20:37:52 -05:00

			`def check_ollama_available():`
feat: Add streaming Ollama support, model caching, and UI improvements - Add streaming summarization via Ollama API (stream_summarize_with_ollama) - Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper) - Add temp file cleanup for extracted audio - Add system capabilities detection (FFmpeg, GPU info) - Add get_video_duration utility - Improve validation with FFmpeg check - Rewrite app.py with streaming support and UI enhancements - Clean up redundant comments and unused imports across all utils 2026-02-18 10:26:09 -05:00			`"""Check if Ollama service is available."""`
Add installation scripts and update documentation for Phase 3 features 2025-03-01 20:37:52 -05:00			`try:`
			`response = requests.get(f"{OLLAMA_API_URL}/tags", timeout=2)`
			`return response.status_code == 200`
			`except requests.exceptions.RequestException:`
			`return False`


			`def list_available_models():`
feat: Add streaming Ollama support, model caching, and UI improvements - Add streaming summarization via Ollama API (stream_summarize_with_ollama) - Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper) - Add temp file cleanup for extracted audio - Add system capabilities detection (FFmpeg, GPU info) - Add get_video_duration utility - Improve validation with FFmpeg check - Rewrite app.py with streaming support and UI enhancements - Clean up redundant comments and unused imports across all utils 2026-02-18 10:26:09 -05:00			`"""List available models in Ollama."""`
Add installation scripts and update documentation for Phase 3 features 2025-03-01 20:37:52 -05:00			`try:`
			`response = requests.get(f"{OLLAMA_API_URL}/tags")`
			`if response.status_code == 200:`
			`models = response.json().get('models', [])`
			`return [model['name'] for model in models]`
			`return []`
			`except requests.exceptions.RequestException as e:`
			`logger.error(f"Error listing Ollama models: {e}")`
			`return []`


			`def summarize_with_ollama(text, model="llama3", max_length=150):`
feat: Add streaming Ollama support, model caching, and UI improvements - Add streaming summarization via Ollama API (stream_summarize_with_ollama) - Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper) - Add temp file cleanup for extracted audio - Add system capabilities detection (FFmpeg, GPU info) - Add get_video_duration utility - Improve validation with FFmpeg check - Rewrite app.py with streaming support and UI enhancements - Clean up redundant comments and unused imports across all utils 2026-02-18 10:26:09 -05:00			`"""Summarize text using Ollama's local API (non-streaming)."""`
Add installation scripts and update documentation for Phase 3 features 2025-03-01 20:37:52 -05:00			`if not check_ollama_available():`
			`logger.warning("Ollama service is not available")`
			`return None`

			`prompt = f"Summarize the following text in about {max_length} words:\n\n{text}"`

			`try:`
			`response = requests.post(`
			`f"{OLLAMA_API_URL}/generate",`
			`json={`
			`"model": model,`
			`"prompt": prompt,`
			`"stream": False,`
			`"options": {`
			`"temperature": 0.3,`
			`"top_p": 0.9,`
feat: Add streaming Ollama support, model caching, and UI improvements - Add streaming summarization via Ollama API (stream_summarize_with_ollama) - Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper) - Add temp file cleanup for extracted audio - Add system capabilities detection (FFmpeg, GPU info) - Add get_video_duration utility - Improve validation with FFmpeg check - Rewrite app.py with streaming support and UI enhancements - Clean up redundant comments and unused imports across all utils 2026-02-18 10:26:09 -05:00			`"max_tokens": max_length * 2`
Add installation scripts and update documentation for Phase 3 features 2025-03-01 20:37:52 -05:00			`}`
			`}`
			`)`

			`if response.status_code == 200:`
			`result = response.json()`
			`return result.get('response', '').strip()`
			`else:`
			`logger.error(f"Ollama API error: {response.status_code} - {response.text}")`
			`return None`
			`except requests.exceptions.RequestException as e:`
			`logger.error(f"Error communicating with Ollama: {e}")`
			`return None`


feat: Add streaming Ollama support, model caching, and UI improvements - Add streaming summarization via Ollama API (stream_summarize_with_ollama) - Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper) - Add temp file cleanup for extracted audio - Add system capabilities detection (FFmpeg, GPU info) - Add get_video_duration utility - Improve validation with FFmpeg check - Rewrite app.py with streaming support and UI enhancements - Clean up redundant comments and unused imports across all utils 2026-02-18 10:26:09 -05:00			`def stream_summarize_with_ollama(text, model="llama3", max_length=150):`
Add installation scripts and update documentation for Phase 3 features 2025-03-01 20:37:52 -05:00			`"""`
feat: Add streaming Ollama support, model caching, and UI improvements - Add streaming summarization via Ollama API (stream_summarize_with_ollama) - Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper) - Add temp file cleanup for extracted audio - Add system capabilities detection (FFmpeg, GPU info) - Add get_video_duration utility - Improve validation with FFmpeg check - Rewrite app.py with streaming support and UI enhancements - Clean up redundant comments and unused imports across all utils 2026-02-18 10:26:09 -05:00			`Summarize text using Ollama with streaming. Yields tokens as they arrive.`
Add installation scripts and update documentation for Phase 3 features 2025-03-01 20:37:52 -05:00
feat: Add streaming Ollama support, model caching, and UI improvements - Add streaming summarization via Ollama API (stream_summarize_with_ollama) - Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper) - Add temp file cleanup for extracted audio - Add system capabilities detection (FFmpeg, GPU info) - Add get_video_duration utility - Improve validation with FFmpeg check - Rewrite app.py with streaming support and UI enhancements - Clean up redundant comments and unused imports across all utils 2026-02-18 10:26:09 -05:00			`Yields:`
			`str: Individual response tokens`
Add installation scripts and update documentation for Phase 3 features 2025-03-01 20:37:52 -05:00			`"""`
feat: Add streaming Ollama support, model caching, and UI improvements - Add streaming summarization via Ollama API (stream_summarize_with_ollama) - Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper) - Add temp file cleanup for extracted audio - Add system capabilities detection (FFmpeg, GPU info) - Add get_video_duration utility - Improve validation with FFmpeg check - Rewrite app.py with streaming support and UI enhancements - Clean up redundant comments and unused imports across all utils 2026-02-18 10:26:09 -05:00			`if not check_ollama_available():`
			`logger.warning("Ollama service is not available")`
			`return`

			`prompt = f"Summarize the following text in about {max_length} words:\n\n{text}"`

			`try:`
			`response = requests.post(`
			`f"{OLLAMA_API_URL}/generate",`
			`json={`
			`"model": model,`
			`"prompt": prompt,`
			`"stream": True,`
			`"options": {`
			`"temperature": 0.3,`
			`"top_p": 0.9,`
			`"max_tokens": max_length * 2`
			`}`
			`},`
			`stream=True`
			`)`

			`if response.status_code == 200:`
			`for line in response.iter_lines():`
			`if line:`
			`data = json.loads(line)`
			`token = data.get('response', '')`
			`if token:`
			`yield token`
			`if data.get('done', False):`
			`break`
			`else:`
			`logger.error(f"Ollama API error: {response.status_code}")`
			`except requests.exceptions.RequestException as e:`
			`logger.error(f"Error communicating with Ollama: {e}")`


			`def chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):`
			`"""Chunk long text and summarize each chunk, then combine."""`
Add installation scripts and update documentation for Phase 3 features 2025-03-01 20:37:52 -05:00			`if len(text) <= chunk_size:`
			`return summarize_with_ollama(text, model, max_length)`

			`words = text.split()`
			`chunks = []`
			`current_chunk = []`
			`current_length = 0`

			`for word in words:`
			`if current_length + len(word) + 1 <= chunk_size:`
			`current_chunk.append(word)`
			`current_length += len(word) + 1`
			`else:`
			`chunks.append(' '.join(current_chunk))`
			`current_chunk = [word]`
			`current_length = len(word) + 1`

			`if current_chunk:`
			`chunks.append(' '.join(current_chunk))`

			`chunk_summaries = []`
			`for i, chunk in enumerate(chunks):`
			`logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")`
			`summary = summarize_with_ollama(chunk, model, max_length // len(chunks))`
			`if summary:`
			`chunk_summaries.append(summary)`

			`if not chunk_summaries:`
			`return None`

			`if len(chunk_summaries) == 1:`
			`return chunk_summaries[0]`

			`combined_summary = " ".join(chunk_summaries)`
feat: Add streaming Ollama support, model caching, and UI improvements - Add streaming summarization via Ollama API (stream_summarize_with_ollama) - Cache ML models with @st.cache_resource (diarization, NER, translation, Whisper) - Add temp file cleanup for extracted audio - Add system capabilities detection (FFmpeg, GPU info) - Add get_video_duration utility - Improve validation with FFmpeg check - Rewrite app.py with streaming support and UI enhancements - Clean up redundant comments and unused imports across all utils 2026-02-18 10:26:09 -05:00			`return summarize_with_ollama(combined_summary, model, max_length)`


			`def stream_chunk_and_summarize(text, model="llama3", chunk_size=4000, max_length=150):`
			`"""`
			`Chunk and summarize with streaming on the final summary.`
			`Returns non-streaming chunk summaries, then streams the final combination.`

			`Yields:`
			`str: Tokens from the final summary`
			`"""`
			`if len(text) <= chunk_size:`
			`yield from stream_summarize_with_ollama(text, model, max_length)`
			`return`

			`words = text.split()`
			`chunks = []`
			`current_chunk = []`
			`current_length = 0`

			`for word in words:`
			`if current_length + len(word) + 1 <= chunk_size:`
			`current_chunk.append(word)`
			`current_length += len(word) + 1`
			`else:`
			`chunks.append(' '.join(current_chunk))`
			`current_chunk = [word]`
			`current_length = len(word) + 1`

			`if current_chunk:`
			`chunks.append(' '.join(current_chunk))`

			`chunk_summaries = []`
			`for i, chunk in enumerate(chunks):`
			`logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")`
			`summary = summarize_with_ollama(chunk, model, max_length // len(chunks))`
			`if summary:`
			`chunk_summaries.append(summary)`

			`if not chunk_summaries:`
			`return`

			`if len(chunk_summaries) == 1:`
			`yield chunk_summaries[0]`
			`return`

			`combined_summary = " ".join(chunk_summaries)`
			`yield from stream_summarize_with_ollama(combined_summary, model, max_length)`