macsunmood committed on
Commit
e4e56ea
·
1 Parent(s): d214c78
Files changed (13)
  1. .streamlit/config.toml +11 -0
  2. README.md +3 -3
  3. app.py +33 -0
  4. requirements.txt +30 -0
  5. transcriber.py +68 -0
  6. ui_create_summary.py +0 -0
  7. ui_home.py +8 -0
  8. ui_result.py +0 -0
  9. ui_summarize.py +338 -0
  10. ui_transcribe.py +284 -0
  11. ui_upload.py +320 -0
  12. ui_video.py +216 -0
  13. utils.py +436 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,11 @@
+ [client]
+ showErrorDetails = true
+
+ [server]
+ headless = false
+ enableCORS = false
+ enableXsrfProtection = false
+ maxUploadSize = 5000
+
+ [theme]
+ primaryColor = "#FA8E00"
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: Conspectum
- emoji: 👀
- colorFrom: pink
- colorTo: pink
+ emoji: 📚
+ colorFrom: yellow
+ colorTo: yellow
  sdk: streamlit
  sdk_version: 1.44.1
  app_file: app.py
app.py ADDED
@@ -0,0 +1,33 @@
+ import streamlit as st
+ ### ICON BANK: 🗣️🎙️🎤🗨 📚📝🎞️👩‍🏫👨‍🏫💡📖 🗒️🔑💾
+
+
+ # Page config
+ st.set_page_config(
+     page_title='Conspectum: Video Lectures Summarization',
+     # page_icon='conspectum_logo.png',
+     page_icon='📚',
+     layout='wide',
+     menu_items={
+         'Get help': 'https://edu.olymponline.ru/',
+         'About': "# MIPT Master's :: Hackathon - Spring '25. Team 8 - Conspectum"
+     },
+ )
+
+ pg = st.navigation({
+     'Home':
+     [
+         # ui_home := st.Page('ui_home.py', title='Welcome', icon='🏠'),
+         ui_upload := st.Page('ui_upload.py', title='Upload', icon='📥'),
+         ui_create_summary := st.Page('ui_create_summary.py', title='Create Summary', icon='✨')
+     ],
+     'Pipeline Sandbox':
+     [
+         ui_transcribe := st.Page('ui_transcribe.py', title='Transcribe', icon='🎙️'),
+         ui_video := st.Page('ui_video.py', title='Analyse Video', icon='🖼️'),
+         ui_summarize := st.Page('ui_summarize.py', title='Summarize', icon='📝'),
+         ui_result := st.Page('ui_result.py', title='Result', icon='✔️')
+     ]
+ })
+
+ pg.run()
requirements.txt ADDED
@@ -0,0 +1,32 @@
+ streamlit
+ streamlit_autorefresh
+ streamlit_extras
+ ffmpeg-python
+ ffmpegcv
+ moviepy
+
+ torch
+ torchvision
+ torchaudio
+ transformers
+ langchain-ollama  # OllamaLLM is imported in ui_summarize.py
+
+ yt-dlp
+
+ openai-whisper
+ faster-whisper
+ SpeechRecognition
+ PyAudio
+ pydub
+ librosa
+
+ python-docx
+ pandas
+ matplotlib
+
+ pyperclip
+
+ opencv-python  # cv2 is imported in utils.py
+ scenedetect
+ easyocr
+ pytesseract
transcriber.py ADDED
@@ -0,0 +1,68 @@
+ import whisper
+ from tempfile import NamedTemporaryFile
+
+
+ class Transcription:
+     def __init__(self, source):
+         self.source = source
+         # self.device = device
+         self.audios = []
+
+         # with NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
+         #     tmp_file.write(file.getvalue())
+         #     # self.audios.append(tmp_file.name)
+         #     self.audios.append(tmp_file)
+
+         self.audios.append(source)
+
+     def transcribe(
+         self,
+         model
+         # whisper_model_option: str,
+         # translation: bool,
+     ):
+         # # Get the whisper model
+         # transcriber = whisper.load_model(whisper_model_option, device=self.device)
+
+         self.output = []
+
+         for idx, _ in enumerate(self.audios):
+             # identify language
+             audio = whisper.load_audio(self.audios[idx])
+             audio = whisper.pad_or_trim(audio)
+
+             # print(model.__dict__)
+             # n_mels = 128 if 'large' in model.name else 80
+             mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
+
+             _, probs = model.detect_language(mel)
+             language = max(probs, key=probs.get)
+
+             self.raw_output = model.transcribe(
+                 self.audios[idx],
+                 language=language,
+                 verbose=True,
+                 word_timestamps=True,
+                 # fp16=(model.device == 'cuda')  # use fp16 on GPU for speed/memory
+             )
+             # if (translation):
+             #     self.translation = model.transcribe(
+             #         self.audios[idx],
+             #         language=language,
+             #         verbose=True,
+             #         word_timestamps=True,
+             #         task='translate'
+             #     )["text"]
+             #     self.raw_output["translation"] = self.translation
+
+             self.segments = self.raw_output['segments']
+             for segment in self.raw_output['segments']:
+                 del segment['tokens']
+
+             self.raw_output.update(
+                 name=self.audios[idx],  # path of the audio that was transcribed
+                 language=language
+             )
+
+             self.output.append(self.raw_output)
+             print(self.raw_output['segments'])
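For orientation, a minimal sketch of how `Transcription` is driven from the Streamlit pages (mirroring `ui_transcribe.py`); the model name and audio path below are placeholders, not part of this commit:

```python
import whisper
from transcriber import Transcription

# assumes a WAV file produced by utils.extract_audio; the path is illustrative
model = whisper.load_model('turbo', device='cpu')   # 'cuda' when available
transcription = Transcription('session_dir/lecture_audio.wav')
transcription.transcribe(model)

result = transcription.output[0]
print(result['language'])                  # detected language code
print(result['text'][:200])                # beginning of the transcript
for seg in result['segments'][:3]:         # word-level timestamps are included
    print(seg['start'], seg['end'], seg['text'])
```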
ui_create_summary.py ADDED
File without changes
ui_home.py ADDED
@@ -0,0 +1,8 @@
+ import streamlit as st
+
+
+ st.title('📚 Conspectum: Video Lectures Summarization 📝')
+
+ st.markdown('''
+ Welcome to the Video Lecture Summarizer app!
+ ''')
ui_result.py ADDED
File without changes
ui_summarize.py ADDED
@@ -0,0 +1,338 @@
1
+ import streamlit as st
2
+ import os
3
+ from transformers import pipeline
4
+ import time
5
+ from docx import Document
6
+ from io import BytesIO
7
+ os.environ['STREAMLIT_SERVER_ENABLE_FILE_WATCHER'] = 'false'
8
+ import torch
9
+ from langchain_ollama.llms import OllamaLLM
10
+ # from utils import cleanup_session_files, get_session_id # for cleanup button
11
+
12
+
13
+ st.title("📝 Step 4: Lecture Notes Summarization & Structuring")
14
+
15
+ # Check if transcript and potentially OCR text are available
16
+ transcript_available = 'transcript' in st.session_state and st.session_state['transcript']
17
+ frames_available = 'frames_dir' in st.session_state and st.session_state['frames_dir']
18
+
19
+ if not transcript_available and not frames_available:
20
+ st.warning("No text content (Transcript or OCR) found. Please complete previous steps first.")
21
+ st.stop()
22
+
23
+ # st.info("This step combines the generated transcript and OCR text (if available) and creates a summary.")
24
+
25
+ # --- Combine Sources ---
26
+ st.subheader('Sources')
27
+ # combined_text = ""
28
+ source_info = []
29
+
30
+ if transcript_available:
31
+ st.success('✅ Transcript found')
32
+ # st.success(len(st.session_state.transcript.__dict__['output']))
33
+ # st.success(st.session_state.transcript.__dict__['output'][0]['text'])
34
+ # combined_text += '--- Transcript ---\n' + st.session_state.transcript['output'][0]['text'] + '\n\n'
35
+ # st.success(st.session_state.transcript.output[0]['text'])
36
+
37
+ transcript_text = st.session_state.transcript.output[0]['text']
38
+
39
+ # combined_text += '--- Transcript ---\n\n' + transcript_text + '\n\n'
40
+
41
+ # st.write(combined_text)
42
+
43
+ source_info.append('Transcript')
44
+ with st.expander('Show Transcript'):
45
+ st.text_area('Transcript', transcript_text, height=200, key='sum_transcript_disp')
46
+ else:
+ st.warning('Transcript not available.')
+ transcript_text = ''  # keeps the emptiness check below from raising NameError
48
+
49
+ if frames_available:
50
+ st.success("✅ Extracted frames found")
51
+ # combined_text += "--- OCR results ---\n" + st.session_state['frames_dir']
52
+ source_info.append('Frames dir')
53
+ # with st.expander('Extracted frames directory'):
54
+ # st.text_area('Extracted frames directory', st.session_state['frames_dir'], height=200, key="sum_ocr_disp")
55
+ # st.text_area('Extracted frames directory', st.session_state['frames_dir'], height=200, key="sum_ocr_disp")
56
+ st.text_input('Extracted frames directory', st.session_state['frames_dir'])
57
+ else:
58
+ st.warning('Extracted frames are not available.')
59
+
60
+ # combined_text = combined_text.strip()
61
+
62
+ # if not combined_text:
63
+ # st.error("Combined text is empty. Cannot proceed.")
64
+ if not transcript_text:
65
+ st.error('Transcript text is empty. Cannot proceed.')
66
+ st.stop()
67
+
68
+
69
+ # --- Summarization Configuration ---
70
+
71
+ st.subheader('Summarization Settings')
72
+ # Consider different models/pipelines
73
+ summarizer_options = ['gemma3',
74
+ # 'gemma3:27b',
75
+ 'phi4',
76
+ 'mistral-small3.1',
77
+
78
+ # 'YandexGPT',
79
+ # 't5-base',
80
+ # 't5-large',
81
+ # 'facebook/mbart-large-50',
82
+
83
+ # 'facebook/bart-large-cnn',
84
+ # 'google/pegasus-xsum',
85
+ ]
86
+
87
+ # Note: Models like Pegasus/XSUM produce very short, abstractive summaries. BART/CNN is better for longer summaries. T5 is versatile.
88
+ selected_model = st.selectbox('Select Summarization Model:', summarizer_options, index=0)
89
+
90
+
91
+ # # Dynamic length based on input size (example logic)
92
+ # # input_length = len(combined_text.split())
93
+ # input_length = len(transcript_text.split()) # approx word count
94
+ # default_min = max(50, input_length // 10) # suggest min length ~10% of input
95
+ # default_max = max(150, input_length // 3) # suggest max length ~30% of input
96
+
97
+ # min_length = st.slider("Minimum Summary Length (tokens):", min_value=30, max_value=max(500, default_max + 100), value=default_min)
98
+ # max_length = st.slider("Maximum Summary Length (tokens):", min_value=50, max_value=max(1000, default_max + 200), value=default_max)
99
+
100
+ # if min_length >= max_length:
101
+ # st.warning("Minimum length should be less than maximum length.")
102
+ # # Adjust max_length automatically or prevent proceeding
103
+ # max_length = min_length + 50 # simple adjustment
104
+
105
+
106
+ # --- Generate Summary ---
107
+
108
+ def describe_video(model, frames_dir, describe_prompt):
109
+ images = []
110
+
111
+ for file in os.listdir(frames_dir):
112
+ images.append(os.path.join(frames_dir, file))
113
+
114
+ model_with_images = model.bind(images=images)
115
+
116
+ return model_with_images.invoke(describe_prompt)
117
+
118
+
119
+
120
+ with st.expander('**Prompt**', expanded=True):
121
+ # col_1, col_2 = st.columns(2)
122
+
123
+ describe_prompt = st.text_area(label='Prompt', height=300, value='''
124
+ Ты - ассистент, который создает конспекты лекций на основе предоставленного текста. Этот текст состоит из двух частей: 1. транскрибация аудио-дорожки видеолекции, 2. Изображение выделенных из видео ключевых кадров, с полезной информацией.
125
+
126
+ Сделай детальный конспект по тому, что описывается в видео. Для иллюстрации сравнений и сопоставлений используй markdown-таблицы. Ответ предоставь в формате markdown.
127
+ Придерживайся следующей структуры:
128
+
129
+ ## Содержание:
130
+ 1. [Название темы 1](###Название_темы_1) (таймкод начала)
131
+ 2. [Название темы 2](###Название_темы_2) (таймкод начала)
132
+ ...
133
+
134
+ ## Краткий конспект:
135
+
136
+ ### Название_темы_1
137
+ [Текст из транскрипции, относящийся к этой теме]
138
+
139
+ [Формулы, относящиеся к этой теме]
140
+
141
+ [Таблицы, относящиеся к этой теме]
142
+
143
+ ---
144
+
145
+ ### Название_темы_2
146
+ [Текст из транскрипции, относящийся к этой теме]
147
+
148
+ [Формулы, относящиеся к этой теме]
149
+
150
+ [Таблицы, относящиеся к этой теме]
151
+
152
+ ---
153
+
154
+
155
+ Здесь необходимо обратить внимание на следующие детали:
156
+ 1. правильно подобрать названия тем
157
+ 2. написать сжатый текст, оставляя (без сильного переформулирования) важную информацию.
158
+ 3. на основе предоставленного транскрибированного аудио и текста со слайдов попытайся составить таблицы в стиле markdown. Для этого проанализируй упомянутые ключевые термины и попытайся понять как их можно сравнить.
159
+ 4. Если ты понимаешь, что на некотором слайде должна быть ВАЖНАЯ формула (непосредственно относящаяся к теме занятия), которую плохо транскрибировали (или пропустили, хотя лектор её проговаривал/упоминал), то можешь привести её самостоятельно, если знаешь о ней. При этом подпиши под ней, что формулу написал ты.
160
+
161
+ Вот упомянутый транскрибированный текст:
162
+
163
+
164
+ ''')
165
+
166
+
167
+ _, col_button_summary, _ = st.columns([2, 1, 2])
168
+ if col_button_summary.button('Generate Summary', type='primary', use_container_width=True):
169
+ st.session_state['summary'] = None # clear previous summary
170
+
171
+ with st.spinner(f'Performing summarization with `{selected_model}` model..'):
172
+ st.session_state.summary = describe_video(model=OllamaLLM(model=selected_model),
173
+ frames_dir=st.session_state.frames_dir,
174
+ describe_prompt=describe_prompt + transcript_text)
175
+
176
+ # if combined_text:
177
+ # with st.spinner(f"Summarizing text using {selected_model}.. Может занять некоторое время (до x2)"):
178
+ # try:
179
+ # start_time = time.time()
180
+
181
+ # # Load the pipeline - specify device if possible
182
+ # device = 0 if torch.cuda.is_available() else -1 # device=0 for first GPU, -1 for CPU
183
+ # summarizer = pipeline("summarization", model=selected_model, device=device)
184
+
185
+ # # Handle potential long input (simplistic chunking if needed, better models handle longer inputs)
186
+ # # Basic check: Transformers often have input limits (e.g., 1024 tokens for BART).
187
+ # # A more robust solution involves chunking, summarizing chunks, and combining summaries.
188
+ # # For this example, we'll try summarizing directly, but add a warning.
189
+ # max_model_input_length = getattr(summarizer.model.config, 'max_position_embeddings', 1024) # get model's max length
190
+ # if len(summarizer.tokenizer.encode(combined_text)) > max_model_input_length:
191
+ # st.warning(f'Input text might be too long for {selected_model} (max ~{max_model_input_length} tokens).' +
192
+ # f'Consider using models designed for longer text or implementing chunking.')
193
+ # # Simple Truncation (Not Ideal):
194
+ # # truncated_text = summarizer.tokenizer.decode(summarizer.tokenizer.encode(combined_text, max_length=max_model_input_length, truncation=True))
195
+ # # summary_result = summarizer(truncated_text, max_length=max_length, min_length=min_length, do_sample=False)
196
+
197
+ # # Attempt summarization (may error if too long and not handled)
198
+ # summary_result = summarizer(combined_text, max_length=max_length, min_length=min_length, do_sample=False)
199
+ # st.session_state['summary'] = summary_result[0]['summary_text']
200
+
201
+ # end_time = time.time()
202
+ # st.success(f"Summary generated in {end_time - start_time:.2f} seconds.")
203
+
204
+ # except Exception as e:
205
+ # st.error(f"Error during summarization: {e}")
206
+ # st.error("This could be due to model loading issues, insufficient memory, or input text length.")
207
+ # if 'summarizer' in locals():
208
+ # del summarizer # try to free memory
209
+ # if device == 0: torch.cuda.empty_cache()
210
+
211
+ # else:
212
+ # st.error("No text available to summarize.")
213
+
214
+
215
+ # --- Display and Refine Summary ---
216
+ # st.subheader('Summary')
217
+
218
+ if 'summary' in st.session_state and st.session_state['summary']:
219
+ with st.container(height=600, border=True):
220
+ summary_container = st.empty()
221
+ edited_summary = st.session_state['summary']
222
+
223
+ # summary_container.markdown(st.session_state['summary'])
224
+ summary_container.markdown(edited_summary, unsafe_allow_html=True)
225
+
226
+ _, col_button_render, _ = st.columns([2, 1, 2])
227
+
228
+ # Use st.text_area for editing
229
+ edited_summary = st.text_area(
230
+ 'Edit the summary here (Markdown format supported):',
231
+ value=st.session_state['summary'],
232
+ height=400,
233
+ key='summary_edit_area'
234
+ )
235
+
236
+ if col_button_render.button('Render Markdown', type='secondary', use_container_width=True):
237
+ with st.spinner('Generating Markdown preview..'):
238
+ # st.markdown(edited_summary, unsafe_allow_html=True)
239
+ summary_container.markdown(edited_summary, unsafe_allow_html=True)
240
+ # st.session_state['summary'] = edited_summary # update summary
241
+ # else:
242
+ # st.markdown('', unsafe_allow_html=True)
243
+
244
+
245
+ # --- Export Options ---
246
+ st.subheader('📥 Export Notes (Download)')
247
+ col_export_md, col_export_docx, col_export_pdf = st.columns(3)
248
+
249
+ st.session_state['final_notes'] = edited_summary # store edited version
250
+ final_notes_md = st.session_state.get('final_notes', '')
251
+
252
+ # 1. Markdown (.md) export
253
+ col_export_md.download_button(
254
+ label="📥 Markdown (.md)",
255
+ data=final_notes_md,
256
+ file_name="lecture_notes.md",
257
+ mime="text/markdown",
258
+ use_container_width=True,
259
+ )
260
+
261
+ # 2. Word (.docx) export
262
+ try:
263
+ doc = Document()
264
+ doc.add_heading('Lecture Notes Summary', 0)
265
+ # Add basic Markdown conversion (very simple - assumes paragraphs)
266
+ # For full Markdown -> Docx, a library like 'pandoc' (external) or more complex parsing is needed.
267
+ paragraphs = final_notes_md.split('\n\n') # split by double newline
268
+ for para in paragraphs:
269
+ if para.strip(): # avoid empty paragraphs
270
+ # Basic handling for potential markdown emphasis (crude)
271
+ # A proper Markdown parser would be better here
272
+ cleaned_para = para.replace('*', '').replace('_', '').replace('#', '').strip()
273
+ doc.add_paragraph(cleaned_para)
274
+
275
+ # Save docx to a BytesIO buffer
276
+ buffer = BytesIO()
277
+ doc.save(buffer)
278
+ buffer.seek(0)
279
+
280
+ col_export_docx.download_button(
281
+ label='📥 Word (.docx)',
282
+ data=buffer,
283
+ file_name='lecture_notes.docx',
284
+ mime='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
285
+ use_container_width=True
286
+ )
287
+ except Exception as docx_e:
288
+ st.error(f'Failed to generate .docx file: {docx_e}')
289
+
290
+ # 3. PDF (.pdf) export
291
+ try:
292
+ col_export_pdf.download_button(
293
+ label='📥 PDF (.pdf)',
294
+ data=buffer,
295
+ file_name="lecture_notes.pdf",
296
+ use_container_width=True,
297
+ # mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
298
+ disabled=True
299
+ )
300
+ except Exception as pdf_e:
301
+ st.error(f'Failed to generate .pdf file: {pdf_e}')
302
+
303
+
304
+
305
+ # 3. PDF Export (Requires extra libraries/setup - Placeholder)
306
+ # st.markdown("---")
307
+ # st.write("**PDF Export:**")
308
+ # try:
309
+ # from mdpdf.cli import mdpdf
310
+ # pdf_buffer = BytesIO()
311
+ # # This often requires command-line execution or careful API usage
312
+ # # Simplified placeholder - actual implementation may vary:
313
+ # # mdpdf(pdf_buffer, md=final_notes_md, ...) # Fictional direct API call
314
+ # st.info("PDF generation via libraries like mdpdf/WeasyPrint requires setup.")
315
+
316
+ # except ImportError:
317
+ # st.warning("`mdpdf` library not installed. PDF export unavailable.")
318
+ # except Exception as pdf_e:
319
+ # st.error(f"Failed to generate PDF (requires setup): {pdf_e}")
320
+
321
+
322
+ else:
323
+ st.info('Summary has not been generated or is empty.')
324
+
325
+
326
+ # --- Optional: Cleanup Button ---
327
+ # st.sidebar.markdown("---")
328
+ # if st.sidebar.button("End Session & Clean Up Files"):
329
+ # session_id = get_session_id()
330
+ # cleanup_session_files(session_id)
331
+ # # Clear relevant session state keys
332
+ # keys_to_clear = ['video_path', 'audio_path', 'frames_dir', 'transcript', 'summary', 'final_notes', 'extracted_frames', 'session_id']
333
+ # for key in keys_to_clear:
334
+ # if key in st.session_state:
335
+ # del st.session_state[key]
336
+ # st.success("Temporary files cleaned and session data cleared.")
337
+ # st.info("You can now start a new session from the 'Main' page.")
338
+ # # Consider navigating back to Main page or just showing message
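The commented-out HF `pipeline` path above notes that long transcripts can exceed a model's input limit and that "a more robust solution involves chunking, summarizing chunks, and combining summaries." A minimal sketch of that idea against the Ollama models used here (chunk sizes and the merge prompt are illustrative, not part of this commit):

```python
from langchain_ollama.llms import OllamaLLM

def chunk_text(text: str, max_words: int = 800, overlap: int = 100):
    """Split a long transcript into overlapping word-based chunks (sizes are illustrative)."""
    words = text.split()
    step = max_words - overlap
    for start in range(0, max(len(words), 1), step):
        yield ' '.join(words[start:start + max_words])

def summarize_long_text(llm: OllamaLLM, text: str, prompt_prefix: str) -> str:
    # Summarize each chunk separately, then ask the model to merge the partial notes.
    partial = [llm.invoke(prompt_prefix + chunk) for chunk in chunk_text(text)]
    return llm.invoke('Merge these partial lecture notes into one coherent Markdown summary:\n\n'
                      + '\n\n'.join(partial))

# usage sketch: summarize_long_text(OllamaLLM(model='gemma3'), transcript_text, describe_prompt)
```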
ui_transcribe.py ADDED
@@ -0,0 +1,284 @@
1
+ import streamlit as st
2
+ from streamlit_extras.stylable_container import stylable_container
3
+
4
+ import os
5
+ import time
6
+ import pathlib
7
+ from datetime import timedelta
8
+
9
+ os.environ['STREAMLIT_SERVER_ENABLE_FILE_WATCHER'] = 'false'
10
+ import whisper # openai-whisper
11
+ import torch # check for GPU availability
12
+
13
+ # from models.loader import load_model_sst
14
+
15
+ from transcriber import Transcription
16
+ import matplotlib.colors as mcolors
17
+
18
+
19
+ st.title('🎙️ Step 2: Speech-to-Text (ASR/STT)')
20
+
21
+ # Check if audio path exists from previous step
22
+ if 'audio_path' not in st.session_state or not st.session_state['audio_path'] or not os.path.exists(st.session_state['audio_path']):
23
+ st.warning('Audio file not found. Please go back to the "**📤 Upload**" page and process a video first.')
24
+ st.stop()
25
+
26
+ audio_path = st.session_state['audio_path']
27
+
28
+
29
+ # st.write(f'Audio file to process: `{os.path.basename(audio_path)}`')
30
+ st.write(f'Processing audio `{st.session_state.video_input_title}` from video input')
31
+
32
+ if 'start_time' not in st.session_state:
33
+ st.session_state.start_time = 0
34
+
35
+ # st.audio(audio_path)
36
+ # format='audio/wav',
37
+ st.audio(audio_path, start_time=st.session_state.start_time)
38
+
39
+ #
40
+ # ==================================================================
41
+ #
42
+
43
+ col_model, col_config = st.columns(2)
44
+
45
+ # --- Model ---
46
+ # with col_model.expander('**MODEL**', expanded=True):
47
+ with col_model.container(border=True):
48
+ model_option = st.selectbox(
49
+ 'STT Model:',
50
+ ['whisper', 'faster-whisper', 'distill-whisper', 'giga'],
51
+ index=0
52
+ )
53
+
54
+
55
+ # sst_model = load_model_sst(model_option)
56
+
57
+
58
+ # --- Configuration ---
59
+ with col_config.expander('**CONFIG**', expanded=True):
60
+ # Determine device
61
+ default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
62
+ device = st.radio(
63
+ 'Compute device:',
64
+ ('cuda', 'cpu'),
65
+ index=0 if default_device == 'cuda' else 1,
66
+ horizontal=True,
67
+ disabled=not torch.cuda.is_available()
68
+ )
69
+
70
+ if device == 'cuda' and not torch.cuda.is_available():
71
+ st.warning('CUDA selected but not available, falling back to CPU')
72
+ device = 'cpu'
73
+
74
+ whisper_model_option = st.selectbox(
75
+ 'Whisper model type:',
76
+ ['tiny', 'base', 'small', 'medium', 'large', 'turbo'],
77
+ index=5
78
+ )
79
+
80
+ pauses = st.checkbox('pauses', value=False)
81
+
82
+ # from models.models_sst import Whisper
83
+ # Whisper.config()
84
+
85
+
86
+ ##
87
+ ## --- Transcription ---
88
+ ##
89
+
90
+ _, col_button_transcribe, _ = st.columns([2, 1, 2])
+ if col_button_transcribe.button('Transcribe', type='primary', use_container_width=True):
92
+ # if input_files:
93
+ # pass
94
+ # else:
95
+ # st.error("Please select a file")
96
+ st.session_state.transcript = None # clear previous transcript
97
+ col_info, col_complete, col_next = st.columns(3)
98
+
99
+ try:
100
+ with st.spinner(f'Loading Whisper `{whisper_model_option}` model and transcribing..'):
101
+ #-- Load whisper model
102
+ start = time.time()
103
+ # Let Whisper handle device placement if possible
104
+ model = whisper.load_model(whisper_model_option, device=device)
105
+ # load_time =
106
+ col_info.info(f'Model loaded in {time.time() - start:.2f} seconds.')
107
+
108
+ #-- Perform transcription
109
+ start = time.time()
110
+ # print('################################')
111
+ # print(st.session_state.audio_path)
112
+ # print('################################')
113
+
114
+ st.session_state.transcript = Transcription(st.session_state.audio_path)
115
+ # st.session_state.transcript = Transcription([audio_path])
116
+ # st.session_state.transcript.transcribe(whisper_model_option)
117
+ # st.markdown(model.name)
118
+ st.session_state.transcript.transcribe(model)
119
+ # result = model.transcribe(audio_path, fp16=(device == 'cuda')) # use fp16 on GPU for speed/memory
120
+ transcribe_time = time.time() - start
121
+
122
+ # st.session_state['transcript'] = result['text']
123
+ # st.session_state['transcript'] = st.session_state.transcript
124
+ # Store segments for timestamping/structuring later
125
+
126
+ # print(len(st.session_state.transcript['segments']))
127
+ # st.session_state['transcript_segments'] = st.session_state.transcript['segments']
128
+
129
+ col_complete.success(f'Transcription complete! (Took {transcribe_time:.2f}s)')
130
+
131
+ col_next.page_link('ui_video.py', label='Next Step: **🖼️ Analyze Video**', icon='➡️')
132
+
133
+ except Exception as e:
134
+ st.error(f'An error occurred during transcription: {e}')
135
+ # Consider unloading model if error occurs to free memory
136
+ if 'model' in locals():
137
+ del model
138
+ if device == 'cuda':
139
+ torch.cuda.empty_cache()
140
+
141
+
142
+ # --- Video Player ---
143
+ with st.expander('**Video Player**', expanded=True):
144
+ col_video, col_segments = st.columns(2)
145
+ col_video.video(st.session_state.video_path, start_time=st.session_state.start_time)
146
+
147
+
148
+ # --- Display Transcript ---
149
+ if 'transcript' in st.session_state and st.session_state['transcript']:
150
+ st.markdown('#### Transcription')
151
+
152
+ output = st.session_state.transcript.output[0]
153
+ # doc = docx.Document()
154
+ avg_confidence_score = 0
155
+ amount_words = 0
156
+ save_dir = str(pathlib.Path(__file__).parent.absolute()) + '/transcripts/'
157
+
158
+ for idx, segment in enumerate(output['segments']):
159
+ for w in output['segments'][idx]['words']:
160
+ amount_words += 1
161
+ avg_confidence_score += w['probability']
162
+
163
+ st.badge(
164
+ f'whisper model: **`{whisper_model_option}`** | ' +
165
+ f'language: **`{output["language"]}`** | ' +
166
+ f'confidence score: **`{round(avg_confidence_score / amount_words, 3)}`**'
167
+ )
168
+ prev_word_end = -1
169
+ text = ""
170
+ html_text = ""
171
+
172
+ # Define the color map
173
+ colors = [(0.6, 0, 0), (1, 0.7, 0), (0, 0.6, 0)]
174
+ cmap = mcolors.LinearSegmentedColormap.from_list('my_colormap', colors)
175
+
176
+
177
+ with st.expander('**TRANSCRIPT**', expanded=True):
178
+ color_coding = st.checkbox(
179
+ 'color coding',
180
+ value=True,
181
+ # key={i},
182
+ help='Цветное кодирование слов в зависимости от вероятности правильного распознавания: от зелёного (хорошо) до красного (плохо)'
183
+ )
184
+
185
+ # https://docs.streamlit.io/develop/api-reference/layout/st.container
186
+ with st.container(height=300, border=False):
187
+ for idx, segment in enumerate(output['segments']):
188
+ for w in output['segments'][idx]['words']:
189
+ # check for pauses in speech longer than 3s
190
+ if pauses and prev_word_end != -1 and w['start'] - prev_word_end >= 3:
191
+ pause = w['start'] - prev_word_end
192
+ pause_int = int(pause)
193
+ html_text += f'{"." * pause_int}{{{pause_int}sec}}'
194
+ text += f'{"." * pause_int}{{{pause_int}sec}}'
195
+ prev_word_end = w['end']
196
+ if (color_coding):
197
+ rgba_color = cmap(w['probability'])
198
+ rgb_color = tuple(round(x * 255)
199
+ for x in rgba_color[:3])
200
+ else:
201
+ rgb_color = (0, 0, 0)
202
+ html_text += f"<span style='color:rgb{rgb_color}'>{w['word']}</span>"
203
+ text += w['word']
204
+ # insert line break if there is a punctuation mark
205
+ if any(c in w['word'] for c in '!?.') and not any(c.isdigit() for c in w['word']):
206
+ html_text += '<br><br>'
207
+ text += '\n\n'
208
+ st.markdown(html_text, unsafe_allow_html=True)
209
+ # doc.add_paragraph(text)
210
+
211
+ # if (translation):
212
+ # with st.expander("English translation"):
213
+ # st.markdown(output["translation"], unsafe_allow_html=True)
214
+
215
+ # # save transcript as docx. in local folder
216
+ # file_name = output['name'] + "-" + whisper_model + \
217
+ # "-" + datetime.today().strftime('%d-%m-%y') + ".docx"
218
+ # doc.save(save_dir + file_name)
219
+
220
+ # bio = io.BytesIO()
221
+ # doc.save(bio)
222
+ # st.download_button(
223
+ # label="Download Transcription",
224
+ # data=bio.getvalue(),
225
+ # file_name=file_name,
226
+ # mime="docx"
227
+ # )
228
+
229
+
230
+ # --- Display Segments with timestamps ---
231
+ # if 'segments' in st.session_state.transcript:
232
+ # with st.expander('Detailed segments (with timestamps)'):
233
+ # st.json(st.session_state.transcript['segments'])
234
+
235
+ format_time = lambda s: str(timedelta(seconds=int(s)))
236
+
237
+ # st.write(st.session_state.transcript.output[0]['segments'])
238
+
239
+
240
+ # https://discuss.streamlit.io/t/replaying-an-audio-file-with-a-timecode-click/48892/9
241
+ # with col_segments.expander('**SEGMENTS**', expanded=True):
242
+ # with col_segments.container('**SEGMENTS**', expanded=True):
243
+ # https://docs.streamlit.io/develop/api-reference/layout/st.container
244
+ with col_segments.container(height=400, border=False):
245
+ # Style buttons as links
246
+ with stylable_container(
247
+ key='link_buttons',
248
+ css_styles='''
249
+ button {
250
+ background: none!important;
251
+ border: none;
252
+ padding: 0!important;
253
+ font-family: arial, sans-serif;
254
+ color: #069;
255
+ cursor: pointer;
256
+ }
257
+ ''',
258
+ ):
259
+ for i, segment in enumerate(st.session_state.transcript.output[0]['segments']):
260
+ start = format_time(segment['start'])
261
+ end = format_time(segment['end'])
262
+ text = segment['text'].strip()
263
+
264
+ # 🕒Segment {i + 1}
265
+ # st.badge(f'**[{start} - {end}]** {text}', color='gray')
266
+ # st.markdown(
267
+ # f':violet-badge[**{start} - {end}**] :gray-badge[{text}]'
268
+ # )
269
+
270
+ col_timecode, col_text = st.columns([1, 5])
271
+ # seg_text = f':violet-badge[**{start} - {end}**] :gray-badge[{text}]'
272
+ if col_timecode.button(f':violet-badge[**{start} – {end}**]', key=f'segment_{i}', use_container_width=True):
+ st.session_state['start_time'] = int(segment['start'])  # seconds, so st.audio/st.video can seek
274
+ st.rerun()
275
+
276
+ # col_text.markdown(f':gray-badge[`{text}`]')
277
+ # col_text.write('#')
278
+ # col_text.markdown(f'<div style="text-align: bottom;">:gray-badge[{text}]</div>', unsafe_allow_html=True)
279
+ col_text.text(f'{text}')
280
+ # col_text.badge(text, color='gray')
281
+
282
+
283
+ # else:
284
+ # st.info('Transcript has not been generated yet.')
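The rendering loop above relies on the segment structure Whisper returns when `word_timestamps=True`; roughly (values are illustrative):

```python
segment = {
    'start': 12.3,
    'end': 17.8,
    'text': ' Today we will discuss gradient descent.',
    'words': [
        {'word': ' Today', 'start': 12.3, 'end': 12.6, 'probability': 0.97},
        {'word': ' we',    'start': 12.6, 'end': 12.7, 'probability': 0.99},
        # ... one entry per word; 'probability' drives the color coding above
    ],
}
```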
ui_upload.py ADDED
@@ -0,0 +1,320 @@
1
+ import streamlit as st
2
+ import os
3
+ import time
4
+
5
+ from yt_dlp import YoutubeDL
6
+ import ffmpeg
7
+ import tempfile
8
+
9
+ from utils import (save_uploaded_file, extract_audio,
10
+ download_youtube, get_session_dir,
11
+ cleanup_session_files, get_session_id,
12
+ get_temp_dir, get_features, proc_raw_audio)
13
+
14
+
15
+ st.title('📥📄 Step 1: Upload Video & Preprocess')
16
+
17
+
18
+ # Initialize session state defaults
19
+ defaults = {
20
+ 'uploaded_file': None,
21
+ 'video_path': None,
22
+ 'audio_path': None,
23
+ 'ocr_text': None,
24
+ 'transcript': None,
25
+ 'summary': None,
26
+
27
+ 'main_topic': None,
28
+
29
+ 'input_method': 'Upload',
30
+ 'input_title': None,
31
+
32
+ 'video_input_path': None,
33
+ 'video_url': None,
34
+
35
+ 'audio_wav': None,
36
+ 'audio_file': None,
37
+ }
38
+
39
+ for key, value in defaults.items():
40
+ st.session_state.setdefault(key, value)
41
+
42
+
43
+ # --- Option to clear previous session ---
44
+ st.sidebar.write('Current Session ID:')
45
+ st.sidebar.write(f'`{get_session_id()}`') # session ID for debugging
46
+
47
+ if st.sidebar.button('Start New Session'):
48
+ session_id = get_session_id() # get current ID before clearing
49
+ cleanup_session_files(session_id)
50
+ for key in list(st.session_state.keys()):
51
+ del st.session_state[key] # clear all session state
52
+ st.rerun() # rerun the script to reflect cleared state
53
+
54
+
55
+ # --- Main Topic ---
56
+ st.session_state.main_topic = st.text_input('Enter video topic:', st.session_state.main_topic)
57
+ # st.session_state.main_topic = m
58
+ # col_url, col_start_from = st.columns([5, 2])
59
+ # video_url = col_url.text_input('Enter YouTube video URL:', example_youtube['url'])
60
+ # start_from = col_start_from.number_input(
61
+ # 'Start From:',
62
+ # min_value=0.0, step=0.5, format='%f', value=example_youtube['start'],
63
+ # help='Time shift from the beginning (in seconds)'
64
+ # )
65
+
66
+ # if video_url:
67
+ # st.session_state.video_url = video_url
68
+ # st.session_state.video_input_path = '' # clear path if URL is used
69
+
70
+
71
+
72
+ # --- Video source selection ---
73
+ input_method = st.radio(
74
+ 'Select Input Method:',
75
+ ('Upload', 'YouTube'),
76
+ key='input_method',
77
+ horizontal=True
78
+ )
79
+
80
+ video_path = None
81
+ uploaded_file = None
82
+ video_url = None
83
+
84
+
85
+ if input_method == 'Upload':
86
+ uploaded_file = st.file_uploader(
87
+ 'Choose a video file',
88
+ type=['mp4', 'avi', 'mkv', 'mov']
89
+ )
90
+
91
+ if uploaded_file:
92
+ col_info, col_ready = st.columns(2)
93
+
94
+ # Display basic file info
95
+ col_info.info('**[ File Details ]** ' +
96
+ f'name: `{uploaded_file.name}` | ' +
97
+ f'type: `{uploaded_file.type}` | ' +
98
+ f'size: `{uploaded_file.size / (1024 * 1024):.2f} MB`')
99
+
100
+ # Save uploaded file temporarily for the Prefect flow
101
+ temp_dir = get_temp_dir() # use a shared temp location
102
+ # Use a unique name to avoid conflicts if multiple users run simultaneously
103
+ target_path = os.path.join(temp_dir, f'upload_{get_session_id()}_{uploaded_file.name}')
104
+ try:
105
+ with open(target_path, 'wb') as f:
106
+ f.write(uploaded_file.getbuffer())
107
+ st.session_state.video_input_path = target_path
108
+ st.session_state.video_input_title = uploaded_file.name
109
+ st.session_state.video_url = '' # clear URL if file is uploaded
110
+ st.session_state.transcript = None
111
+ st.session_state.summary = None
112
+ col_ready.info('Ready for processing.')
113
+
114
+ except Exception as e:
115
+ col_ready.error(f'Error saving uploaded file: {e}')
116
+ st.session_state.video_input_path = ''
117
+
118
+
119
+ elif input_method == 'YouTube':
120
+ #-- Obtain audio from YouTube video
121
+ example_youtube = {
122
+ 'title': 'Общественное движение',
123
+ 'url': 'https://www.youtube.com/watch?v=c3bhkrKF6F4',
124
+ 'start': 0.0
125
+ }
126
+
127
+ col_url, col_start_from = st.columns([5, 2])
128
+ video_url = col_url.text_input('Enter YouTube video URL:', example_youtube['url'])
129
+ start_from = col_start_from.number_input(
130
+ 'Start From:',
131
+ min_value=0.0, step=0.5, format='%f', value=example_youtube['start'],
132
+ help='Time shift from the beginning (in seconds)'
133
+ )
134
+
135
+ if video_url:
136
+ st.session_state.video_url = video_url
137
+ st.session_state.video_input_path = '' # clear path if URL is used
138
+
139
+
140
+ @st.cache_resource
141
+ def ui_processed_sound(audio_wav, audio_np):
142
+ '''UI to show sound processing results'''
143
+ st.audio(audio_wav)
144
+ features = get_features(audio_np)
145
+
146
+
147
+ @st.cache_resource
148
+ def extract_videofile(video_file):
149
+ # video_buffer = BytesIO(video_file.read())
150
+ # audio_data = VideoFileClip(video_buffer.name).audio
151
+
152
+ # raw_source = StringIO(video_file.getvalue().decode('utf-8'))
153
+ # raw_source = video_file.getvalue().decode('utf-8')
154
+ # raw_source = video_file.read()
155
+ # raw_source = BytesIO(video_file.getvalue())
156
+
157
+ #-- Get video
158
+ # out, err = (
159
+ # ffmpeg
160
+ # .input(video_file, ss=start_from)
161
+ # .output('temp.mp4', vcodec='copy')
162
+ # .overwrite_output()
163
+ # .run()
164
+ # )
165
+ # st.video('temp.mp4')
166
+
167
+ # video = VideoFileClip(video_file)
168
+ # audio = video.audio
169
+ # audio.write_audiofile('output_audio.mp3')
170
+
171
+ tfile = tempfile.NamedTemporaryFile(delete=False)
172
+ tfile.write(video_file.read())
173
+
174
+ #-- Get audio
175
+ # SAMPLE_RATE = 16000
176
+ audio_data, err = (
177
+ ffmpeg
178
+ .input(tfile.name, ss=start_from)
179
+ .output('pipe:', format='wav')#, acodec='pcm_s16le')
180
+ # .output('pipe:', format='s16le', ac=1, acodec='pcm_s16le', ar=SAMPLE_RATE)
181
+ # .global_args('-nostdin', '-threads', '0')
182
+ .run(capture_stdout=True)
183
+ )
184
+ if err:
185
+ raise RuntimeError(f'Failed to load audio: {err.decode()}')
186
+
187
+ return audio_data
188
+
189
+
190
+ @st.cache_resource
191
+ def extract_youtube(raw_url):
192
+ #-- Get video
193
+ # out, err = (
194
+ # ffmpeg
195
+ # .input(raw_url, ss=start_from)
196
+ # .output('temp.mp4', vcodec='copy')
197
+ # .overwrite_output()
198
+ # .run()
199
+ # )
200
+ # st.video('temp.mp4')
201
+
202
+ #-- Get audio
203
+ # SAMPLE_RATE = 16000
204
+ audio_data, err = (
205
+ ffmpeg
206
+ .input(raw_url, ss=start_from)
207
+ .output('pipe:', format='wav')#, acodec='pcm_s16le')
208
+ # .output('pipe:', format='s16le', ac=1, acodec='pcm_s16le', ar=SAMPLE_RATE)
209
+ .global_args('-nostdin', '-threads', '0')
210
+ .run(capture_stdout=True)
211
+ )
212
+ if err:
213
+ raise RuntimeError(f'Failed to load audio: {err.decode()}')
214
+
215
+ return audio_data
216
+
217
+
218
+
219
+
220
+ # --- Processing Button ---
221
+ if st.button('Process video input',
222
+ type='primary',
223
+ disabled=not (st.session_state.video_input_path or st.session_state.video_url)
224
+ ):
225
+ # Clear previous paths if reprocessing
226
+ st.session_state['video_path'] = None
227
+ st.session_state['audio_path'] = None
228
+
229
+ col_info, col_complete, col_next = st.columns(3)
230
+
231
+ with st.spinner('Processing video input..'):
232
+ if st.session_state['input_method'] == 'Upload' and uploaded_file:
233
+ st.session_state.uploaded_file = uploaded_file
234
+ video = uploaded_file
235
+ # audio_data = extract_videofile(uploaded_file)
236
+
237
+ saved_path = save_uploaded_file(uploaded_file)
238
+ if saved_path:
239
+ st.session_state['video_path'] = saved_path
240
+ col_info.success(f'Video saved temporarily to: {os.path.basename(saved_path)}')
241
+ else:
242
+ col_info.error('Failed to save uploaded file')
243
+
244
+ elif st.session_state['input_method'] == 'YouTube' and video_url:
245
+ try:
246
+ with YoutubeDL({'format': 'best+bestaudio'}) as ydl:
247
+ info = ydl.extract_info(video_url, download=False)
248
+ except Exception as e:
249
+ st.error(e)
250
+ else:
251
+ st.write(f"<small><div style='float: center; text-align: center'>\
252
+ **Title:** [{info['title']}]({video_url})\
253
+ **Duration:** {info['duration']} sec.</div></small>",
254
+ unsafe_allow_html=True)
255
+
256
+ video = video_url
257
+ # audio_data = extract_youtube(info['url'])
258
+ st.session_state.video_input_title = info['title']
259
+
260
+ session_dir = get_session_dir()
261
+ os.makedirs(session_dir, exist_ok=True)
262
+ downloaded_path = download_youtube(video_url, session_dir)
263
+ if downloaded_path and os.path.exists(downloaded_path):
264
+ st.session_state['video_path'] = downloaded_path
265
+ col_info.success(f'YouTube video downloaded: {os.path.basename(downloaded_path)}')
266
+ else:
267
+ col_info.error('Failed to download YouTube video')
268
+
269
+ else:
270
+ st.warning('Please upload a file or provide a YouTube URL')
271
+ st.stop()
272
+
273
+
274
+ # --- Basic Preprocessing: Audio Extraction ---
275
+ if st.session_state['video_path']:
276
+ # st.write('Extracting audio..')
277
+ start = time.time()
278
+ # Ensure utils.extract_audio uses the correct path
279
+ audio_path = extract_audio(st.session_state['video_path'])
280
+ end = time.time()
281
+ if audio_path and os.path.exists(audio_path):
282
+ st.session_state['audio_path'] = audio_path
283
+ col_info.success(f'Audio extracted to: {os.path.basename(audio_path)} (took {end - start:.2f}s)')
284
+ else:
285
+ col_info.error('Failed to extract audio from the video')
286
+ st.warning('Proceeding without audio. STT step will be skipped')
287
+ st.session_state['audio_path'] = None # explicitly set to None
288
+
289
+ if st.session_state['video_path']:
290
+ col_complete.info('Preprocessing complete')
291
+ col_next.page_link('ui_transcribe.py', label='Next Step: 🎙️ **Transcribe**', icon='➡️')
292
+
293
+
294
+ # Display video
295
+ st.subheader('Video Player')
296
+ _, col_video, _ = st.columns([1, 3, 1])
297
+ col_video.video(video)
298
+
299
+ # audio_data = audio_path
300
+ # audio_wav, audio_np = proc_raw_audio(audio_data)
301
+
302
+ # st.session_state.audio_wav = audio_wav
303
+ # st.session_state.audio_np = audio_np
304
+
305
+ # # st.session_state.video = video.read()
306
+
307
+ # ui_processed_sound(audio_wav, audio_np)
308
+
309
+
310
+ # # Display current status
311
+ # st.subheader("Current Status:")
312
+ # if st.session_state.get('video_path'):
313
+ # st.success(f"✅ Video Loaded: {os.path.basename(st.session_state['video_path'])}")
314
+ # else:
315
+ # st.warning("⏳ Video not yet loaded or processed.")
316
+
317
+ # if st.session_state.get('audio_path'):
318
+ # st.success(f"✅ Audio Extracted: {os.path.basename(st.session_state['audio_path'])}")
319
+ # elif st.session_state.get('video_path'): # only show warning if video was loaded but audio failed
320
+ # st.warning("⚠️ Audio extraction failed or video has no audio track.")
ui_video.py ADDED
@@ -0,0 +1,216 @@
1
+ import streamlit as st
2
+ import os
3
+ import pytesseract
4
+ from PIL import Image
5
+ import time
6
+ from utils import extract_frames_interval, extract_frames_pyscenedetect
7
+
8
+
9
+ st.title('🖼️ Step 3: Video Processing (Frame Extraction & OCR)')
10
+
11
+
12
+ # Check if video path exists
13
+ if ('video_path' not in st.session_state or
14
+ not st.session_state['video_path'] or
15
+ not os.path.exists(st.session_state['video_path'])
16
+ ):
17
+ st.warning('Video file not found. Please go back to the **📤 Upload** page and process a video first.')
18
+ st.stop()
19
+
20
+ video_path = st.session_state['video_path']
21
+ st.write(f'Video file to process: `{os.path.basename(video_path)}`')
22
+
23
+ #
24
+ # ==================================================================
25
+ #
26
+
27
+ col_method, col_config = st.columns(2)
28
+
29
+ # --- Method ---
30
+ # with col_model.expander('**MODEL**', expanded=True):
31
+ with col_method.container(border=True):
32
+ # extraction_method = st.selectbox(
33
+ # 'Extraction method:',
34
+ # ('interval', 'video2slides', 'pyscenedetect'),
35
+ # index=0
36
+ # )
37
+ extraction_method = st.radio(
38
+ 'Extraction method:',
39
+ ('interval', 'video2slides', 'pyscenedetect'),
40
+ index=0,
41
+ horizontal=True,
42
+ )
43
+
44
+ # col_config_frame_interval, col_config_ocr_lang = st.columns(2)
45
+ # frame_interval = col_config_frame_interval.slider('Extract frames every `X` seconds:', min_value=1, max_value=60, value=5, step=1)
46
+ # ocr_lang = col_config_ocr_lang.text_input('OCR Language(s) (e.g. `rus`, `rus+eng`):', value='rus')
47
+ ocr_lang = st.text_input('OCR Language(s) (e.g. `rus`, `rus+eng`):', value='rus')
48
+
49
+ # --- Configuration ---
50
+ with col_config.expander(f'**`{extraction_method}` METHOD CONFIG**', expanded=True):
51
+ match extraction_method:
52
+ case 'interval':
53
+ extraction_interval = st.number_input(
54
+ 'Frames extraction interval:',
55
+ min_value=0, max_value=25, step=1, format='%i', value=5,
56
+ help='Extract frames every `x` seconds'
57
+ )
58
+ case 'video2slides':
59
+ print('video2slides')
60
+ case 'pyscenedetect':
61
+ extraction_threshold = st.number_input(
62
+ 'Frames extraction threshold:',
63
+ min_value=0.1, max_value=10.0, step=0.1, format='%f', value=2.0,
64
+ )
65
+
66
+
67
+ # --- Semantic Segmentation Placeholder ---
68
+ # st.markdown("---")
69
+ # --- Tesseract Configuration (Optional but recommended) ---
70
+ # Uncomment and set the path if tesseract is not in your PATH
71
+ # pytesseract.pytesseract.tesseract_cmd = r'/path/to/your/tesseract' # Example: '/usr/bin/tesseract' or 'C:\Program Files\Tesseract-OCR\tesseract.exe'
72
+
73
+
74
+
75
+
76
+ # # --- Frame Extraction and OCR ---
77
+ # st.subheader('OCR')
78
+
79
+ if st.button('Extract Frames'):
80
+ # st.session_state['ocr_text'] = None # clear previous results
81
+ st.session_state['frames_paths'] = []
82
+ # all_ocr_results = []
83
+
84
+ col_info, col_complete, col_next = st.columns(3)
85
+
86
+ match extraction_method:
87
+ case 'interval':
88
+ with st.spinner(f'Extracting frames every {extraction_interval} seconds (using interval method)..'):
89
+ start_time = time.time()
90
+ frames_dir, frame_paths = extract_frames_interval(video_path, 'frames_interval', interval_sec=extraction_interval)
91
+ extract_time = time.time() - start_time
92
+ if frames_dir and frame_paths:
93
+ st.session_state['frames_dir'] = frames_dir
94
+ st.session_state['frames_paths'] = frame_paths # store paths
95
+ col_info.success(f'Extracted {len(frame_paths)} frames in {extract_time:.2f}s.')
96
+ else:
97
+ col_info.error('Failed to extract frames')
98
+ st.stop()
99
+ case 'video2slides':
100
+ pass
101
+ case 'pyscenedetect':
102
+ with st.spinner(f'Extracting frames with `threshold={extraction_threshold}` (using pyscenedetect method)..'):
103
+ start_time = time.time()
104
+ frames_dir, frame_paths = extract_frames_pyscenedetect(video_path, 'frames_pyscenedetect', threshold=extraction_threshold)
105
+ extract_time = time.time() - start_time
106
+ if frames_dir and frame_paths:
107
+ st.session_state['frames_dir'] = frames_dir
108
+ st.session_state['frames_paths'] = frame_paths # store paths
109
+ col_info.success(f'Extracted {len(frame_paths)} frames in {extract_time:.2f}s.')
110
+ else:
111
+ col_info.error('Failed to extract frames')
112
+ st.stop()
113
+
114
+
115
+ if st.session_state['frames_paths']:
116
+ total_frames = len(st.session_state['frames_paths'])
117
+ col_info.write(f'Performing OCR on {total_frames} frames..')
118
+ ocr_progress = st.progress(0)
119
+ start_ocr_time = time.time()
120
+
121
+ extracted_texts = []
122
+ processed_count = 0
123
+
124
+ # Use columns to display some example frames and OCR
125
+ max_display_frames = 6
126
+ display_cols = st.columns(min(max_display_frames, total_frames) if total_frames > 0 else 1)
127
+ display_idx = 0
128
+
129
+ # Process frames in batches or one by one
130
+ for i, frame_path in enumerate(st.session_state['frames_paths']):
131
+ try:
132
+ img = Image.open(frame_path)
133
+ # --- Potential Preprocessing/Filtering ---
134
+ # Add logic here if needed:
135
+ # - Detect if frame likely contains text (e.g., check contrast, edges)
136
+ # - If segmentation was implemented, crop to slide regions here
137
+ # --- Perform OCR ---
138
+ text = pytesseract.image_to_string(img, lang=ocr_lang)
139
+ # --- Basic Text Cleaning/Filtering ---
140
+ cleaned_text = text.strip()
141
+ if cleaned_text and len(cleaned_text) > 10: # filter very short/noisy results
142
+ # Extract timestamp from filename (assuming format frame_XXXXXX.png)
143
+ try:
144
+ secs = int(os.path.basename(frame_path).split('_')[1].split('.')[0])
145
+ timestamp = time.strftime('%H:%M:%S', time.gmtime(secs))
146
+ extracted_texts.append({'timestamp': timestamp, 'text': cleaned_text})
147
+ except:
148
+ extracted_texts.append({'timestamp': 'N/A', 'text': cleaned_text}) # fallback if filename parse fails
149
+
150
+
151
+ # Display some examples
152
+ if display_idx < max_display_frames and display_idx < len(display_cols):
153
+ with display_cols[display_idx]:
154
+ st.image(img, caption=f'Frame (t={timestamp})', use_container_width=True)
155
+ st.text(f'OCR:\n{cleaned_text[:100]}..') # show snippet
156
+ display_idx += 1
157
+
158
+
159
+ processed_count += 1
160
+ ocr_progress.progress(processed_count / total_frames)
161
+
162
+ except Exception as ocr_err:
163
+ col_info.warning(f'Could not perform OCR on {os.path.basename(frame_path)}: {ocr_err}')
164
+ processed_count += 1 # still count as processed
165
+ ocr_progress.progress(processed_count / total_frames)
166
+
167
+ ocr_time = time.time() - start_ocr_time
168
+ col_complete.success(f'OCR processing finished in {ocr_time:.2f}s.')
169
+
170
+ # --- Aggregate and Deduplicate OCR Text ---
171
+ # Simple approach: Combine unique text blocks
172
+ final_ocr_text = ""
173
+ seen_texts = set()
174
+ last_text = ""
175
+ min_similarity_threshold = 0.8 # requires a library like `thefuzz` or similar for proper check
176
+ # basic check: avoid exact consecutive duplicates
177
+
178
+ for item in extracted_texts:
179
+ current_text_block = item['text'].strip()
180
+
181
+ # Basic check: Only add if significantly different from the last block
182
+ # A more robust check would involve sequence matching or fuzzy matching
183
+ is_duplicate = False
184
+ if last_text:
185
+ # Simple check: exact match or near-exact length/content start?
186
+ if (current_text_block == last_text or
187
+ (abs(len(current_text_block) - len(last_text)) < 10 and
188
+ current_text_block.startswith(last_text[:20]))
189
+ ):
190
+ is_duplicate = True # likely a duplicate from consecutive frames
191
+
192
+ if current_text_block and not is_duplicate: # only add non-empty, non-duplicate text
193
+ final_ocr_text += f"\n\n--- Text from frame around {item['timestamp']} ---\n"
194
+ final_ocr_text += current_text_block
195
+ last_text = current_text_block # update last text added
196
+
197
+ st.session_state['ocr_text'] = final_ocr_text.strip()
198
+
199
+ if st.session_state['ocr_text']:
200
+ col_complete.info('OCR processing complete.')
201
+ col_next.page_link('ui_summarize.py', label='Next Step: **📝 Summarize**', icon='➡️')
202
+ else:
203
+ col_complete.warning('No significant text found via OCR')
204
+
205
+
206
+ # --- Display OCR Results ---
207
+ st.subheader('Aggregated OCR Text')
208
+ if 'ocr_text' in st.session_state and st.session_state['ocr_text']:
209
+ st.text_area("Extracted Text from Frames", st.session_state['ocr_text'], height=400)
210
+ else:
211
+ st.info('OCR has not been run or no text was detected')
212
+
213
+
214
+ # st.divider()
215
+
216
+ # st.subheader('Semantic Segmentation')
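The deduplication above only catches exact or near-exact consecutive repeats; the `min_similarity_threshold` comment points at fuzzy matching. A stdlib-only sketch of such a check (the threshold is illustrative; `thefuzz` would work similarly):

```python
from difflib import SequenceMatcher

def is_near_duplicate(current: str, previous: str, threshold: float = 0.8) -> bool:
    """Rough similarity check for consecutive OCR blocks from adjacent frames."""
    if not current or not previous:
        return False
    return SequenceMatcher(None, current, previous).ratio() >= threshold

# usage inside the aggregation loop (sketch):
#   if current_text_block and not is_near_duplicate(current_text_block, last_text):
#       final_ocr_text += ...
```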
utils.py ADDED
@@ -0,0 +1,436 @@
1
+ import os
2
+ import tempfile
3
+ # import ffmpeg
4
+ from moviepy.video.io.VideoFileClip import VideoFileClip
5
+ import cv2
6
+ import uuid
7
+ import tomllib
8
+ from pathlib import Path
9
+
10
+ import streamlit as st
11
+
12
+ import numpy as np
13
+ from io import BytesIO
14
+
15
+ from pydub import AudioSegment
16
+ from pydub.silence import detect_leading_silence
17
+ import librosa
18
+
19
+ import librosa.display as lbd
20
+ import matplotlib.pyplot as plt
21
+
22
+
23
+ TEMP_DIR = tempfile.mkdtemp()
24
+
25
+
26
+ CONFIG_FILE = 'config.toml'
27
+
28
+
29
+ def load_config():
30
+ """Loads configuration from config.toml"""
31
+ try:
32
+ with open(CONFIG_FILE, 'rb') as f:
33
+ return tomllib.load(f)
34
+ except FileNotFoundError:
35
+ print(f"Error: {CONFIG_FILE} not found. Using default settings.")
36
+ # Provide default fallback config if needed
37
+ return {
38
+ "paths": {"output_dir": "output", "temp_dir": "temp_processing"},
39
+ "models": {"whisper_model": "base.en", "ocr_languages": ["en"], "summarization_model": "google/pegasus-xsum"},
40
+ "settings": {"frame_extraction_interval_seconds": 10, "max_summary_length": 500, "min_summary_length": 100}
41
+ }
42
+ except Exception as e:
43
+ print(f"Error loading config: {e}")
44
+ raise # Re-raise after printing
45
+
46
+
47
+ CONFIG = load_config()
48
+
49
+
50
+ def ensure_dir(directory_path):
51
+ """Creates a directory if it doesn't exist."""
52
+ Path(directory_path).mkdir(parents=True, exist_ok=True)
53
+
54
+
55
+
56
+
57
+ def save_uploaded_file(uploaded_file):
58
+ """Saves an uploaded file to a temporary directory."""
59
+ if uploaded_file is not None:
60
+ # Generate a unique sub-directory for this upload
61
+ session_id = get_session_id() # simple way to group files per session/upload
62
+ upload_dir = os.path.join(TEMP_DIR, session_id)
63
+ os.makedirs(upload_dir, exist_ok=True)
64
+
65
+ file_path = os.path.join(upload_dir, uploaded_file.name)
66
+ with open(file_path, 'wb') as f:
67
+ f.write(uploaded_file.getbuffer())
68
+ print(f'File saved to: {file_path}') # debugging
69
+ return file_path
70
+ return None
71
+
72
+
73
+ def get_session_id():
74
+ """Generates or retrieves a unique session ID."""
75
+ if 'session_id' not in st.session_state:
76
+ st.session_state['session_id'] = str(uuid.uuid4())[:8]
77
+ return st.session_state['session_id']
78
+
79
+
80
+ def get_session_dir():
81
+ """Gets the temporary directory path for the current session."""
82
+ session_id = get_session_id()
83
+ return os.path.join(TEMP_DIR, session_id)
84
+
85
+
86
+ def get_temp_dir():
87
+ """Creates and returns the path to a temporary directory for processing."""
88
+ temp_dir = Path(CONFIG['paths']['temp_dir'])
89
+ ensure_dir(temp_dir)
90
+ # Consider using unique subdirs per run if needed
91
+ # processing_subdir = tempfile.mkdtemp(dir=temp_dir)
92
+ # return processing_subdir
93
+ return str(temp_dir) # Return as string for wider compatibility
94
+
95
+
96
+ def extract_audio(video_path, audio_format="wav"):
97
+ """Extracts audio from video using moviepy."""
98
+ try:
99
+ session_dir = os.path.dirname(video_path) # Assumes video is in session dir
100
+ base_name = os.path.splitext(os.path.basename(video_path))[0]
101
+ audio_filename = f"{base_name}_audio.{audio_format}"
102
+ audio_path = os.path.join(session_dir, audio_filename)
103
+
104
+ if os.path.exists(audio_path):
105
+ print(f"Audio file already exists: {audio_path}")
106
+ return audio_path
107
+
108
+ print(f"Extracting audio from {video_path} to {audio_path}...")
109
+ video_clip = VideoFileClip(video_path)
110
+ audio_clip = video_clip.audio
111
+ if audio_clip is None:
112
+ print("No audio track found in the video.")
113
+ video_clip.close()
114
+ return None
115
+ audio_clip.write_audiofile(audio_path, codec='pcm_s16le' if audio_format == 'wav' else 'mp3') # WAV is often better for STT
116
+ audio_clip.close()
117
+ video_clip.close()
118
+ print("Audio extraction complete.")
119
+ return audio_path
120
+ except Exception as e:
121
+ print(f"Error extracting audio: {e}")
122
+ # Clean up potentially corrupted file
123
+ if 'audio_clip' in locals() and audio_clip:
124
+ audio_clip.close()
125
+ if 'video_clip' in locals() and video_clip:
126
+ video_clip.close()
127
+ # Attempt to remove partial file if creation failed mid-way
128
+ if 'audio_path' in locals() and os.path.exists(audio_path):
129
+ try:
130
+ os.remove(audio_path)
131
+ except OSError as rm_e:
132
+ print(f"Could not remove partial audio file {audio_path}: {rm_e}")
133
+ return None
134
+
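+ # Usage sketch (file name is illustrative): extract a WAV track from the uploaded video so
+ # it can be fed to the Whisper transcriber.
+ #
+ # audio_path = extract_audio(video_path, audio_format='wav')
+ # if audio_path is not None:
+ #     print(f'Audio ready for transcription: {audio_path}')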
135
+
136
+ from scenedetect import open_video, SceneManager
137
+ from scenedetect.detectors import ContentDetector
138
+
139
+
140
+ def extract_frames_pyscenedetect(video_path, output_dir, threshold=2.0):
+ '''Detects scene changes with PySceneDetect and saves the first frame of each scene.'''
141
+ # session_dir = os.path.dirname(video_path)
142
+ # frames_dir = os.path.join(session_dir, 'frames_pyscenedetect')
143
+ # os.makedirs(frames_dir, exist_ok=True)
144
+ os.makedirs(output_dir, exist_ok=True) # ensure the output dir exists
145
+
146
+ # Init video and scene managers
147
+ # video_manager = VideoManager([video_path])
148
+ video = open_video(video_path)
149
+ scene_manager = SceneManager()
150
+
151
+ scene_manager.add_detector(ContentDetector(threshold=threshold))
152
+
153
+ # Start analysis
154
+ # video_manager.set_downscale_factor()
155
+ # video_manager.start()
156
+ # scene_manager.detect_scenes(frame_source=video_manager)
157
+ scene_manager.detect_scenes(video)
158
+ print(scene_manager.get_scene_list())
159
+
160
+ # Get the scene list
161
+ scene_list = scene_manager.get_scene_list()
162
+ print(f'Detected {len(scene_list)} scene changes.')
163
+
164
+ # Save the scenes switch frames
165
+ cap = cv2.VideoCapture(video_path)
166
+ if not cap.isOpened():
167
+ print(f'Error: Could not open video file {video_path}')
168
+ return None
169
+
170
+ extracted_frame_paths = []
171
+
172
+ for i, (start_time, _) in enumerate(scene_list):
173
+ frame_num = start_time.get_frames()
174
+ cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
175
+ success, frame = cap.read()
176
+ if success:
177
+ timestamp_ms = cap.get(cv2.CAP_PROP_POS_MSEC)
178
+ # frame_filename = f'scene_{i + 1:03d}.jpg'
179
+ frame_filename = f'frame_{int(timestamp_ms / 1000):06d}.png' # naming by seconds
180
+ frame_path = os.path.join(output_dir, frame_filename)
181
+ cv2.imwrite(frame_path, frame)
182
+ print(f'[*] Saved frame {frame_num} to {frame_path}')
183
+ extracted_frame_paths.append(frame_path)
184
+ else:
185
+ print(f'[!] Error reading frame {frame_num}')
186
+
187
+ cap.release()
188
+ print(f'Extracted {len(extracted_frame_paths)} frames to {output_dir}.')
189
+ return output_dir, extracted_frame_paths
191
+
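+ # Usage sketch (paths are illustrative): scene-change based extraction; lower ContentDetector
+ # thresholds make detection more sensitive, which helps catch subtle slide transitions.
+ #
+ # frames_dir, frame_paths = extract_frames_pyscenedetect(
+ #     video_path, os.path.join(get_session_dir(), 'frames_scenes'), threshold=2.0)
+ # print(f'{len(frame_paths)} scene-change frames saved to {frames_dir}')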
192
+
193
+ def extract_frames_interval(video_path, output_dir, interval_sec=5):
194
+ '''Extracts frames from video at specified intervals using OpenCV.'''
195
+ try:
196
+ # session_dir = os.path.dirname(video_path)
197
+ # frames_dir = os.path.join(session_dir, 'frames_interval')
198
+ # os.makedirs(frames_dir, exist_ok=True)
199
+ os.makedirs(output_dir, exist_ok=True) # ensure the output dir exists
200
+
201
+ print(f'Extracting frames from {video_path} every {interval_sec}s..')
202
+ cap = cv2.VideoCapture(video_path)
203
+ if not cap.isOpened():
204
+ print(f'Error: Could not open video file {video_path}')
205
+ return None
206
+
207
+ fps = cap.get(cv2.CAP_PROP_FPS)
208
+ if fps == 0:
209
+ print('Warning: Could not get FPS, defaulting to 30.')
210
+ fps = 30 # provide a default if FPS is not available
211
+
212
+ frame_interval = int(fps * interval_sec)
213
+ frame_count = 0
214
+ extracted_frame_paths = []
215
+
216
+ def extract_frame():
217
+ timestamp_ms = cap.get(cv2.CAP_PROP_POS_MSEC)
218
+ frame_filename = f'frame_{int(timestamp_ms / 1000):06d}.png' # naming by seconds
219
+ frame_path = os.path.join(output_dir, frame_filename)
220
+ cv2.imwrite(frame_path, frame)
221
+ extracted_frame_paths.append(frame_path)
222
+
223
+ success = True
224
+ while success:
225
+ if frame_count % frame_interval == 0:
226
+ # Decode only the frames we actually keep
227
+ success, frame = cap.read()
228
+ if success:
229
+ extract_frame()
230
+ else:
231
+ # Skip intermediate frames without decoding them; grab() just advances the stream
232
+ success = cap.grab()
233
+ frame_count += 1
246
+
247
+ cap.release()
248
+ print(f'Extracted {len(extracted_frame_paths)} frames to {output_dir}.')
249
+ return output_dir, extracted_frame_paths
250
+ except Exception as e:
251
+ print(f'Error extracting frames: {e}')
252
+ if 'cap' in locals() and cap.isOpened():
253
+ cap.release()
254
+ return None, []
255
+
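+ # Usage sketch (illustrative): fixed-interval sampling driven by the config value above;
+ # with a 30 fps source and a 10 s interval, roughly every 300th frame is kept.
+ #
+ # frames_dir, frame_paths = extract_frames_interval(
+ #     video_path, os.path.join(get_session_dir(), 'frames_interval'),
+ #     interval_sec=CONFIG['settings']['frame_extraction_interval_seconds'])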
256
+
257
+ # --- Add other potential helpers: yt-dlp download, file cleanup etc. ---
258
+ def download_youtube(url, output_dir):
259
+ """Downloads YouTube video using yt-dlp."""
260
+ import yt_dlp
261
+ ydl_opts = {
262
+ 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
263
+ 'outtmpl': os.path.join(output_dir, '%(title)s.%(ext)s'),
264
+ 'noplaylist': True, # download only single video if URL is part of playlist
265
+ 'progress_hooks': [lambda d: print(d['status'])] # basic progress
266
+ }
267
+ try:
268
+ print(f'Attempting to download YouTube video: {url}')
269
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
270
+ info = ydl.extract_info(url, download=True)
271
+ # Try to get the downloaded filename
272
+ filename = ydl.prepare_filename(info)
273
+ print(f"YouTube video downloaded to: {filename}")
274
+ return filename
275
+ except Exception as e:
276
+ print(f"Error downloading YouTube video: {e}")
277
+ return None
278
+
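+ # Usage sketch (URL is a placeholder): download into the session directory, then reuse the
+ # audio/frame helpers above on the resulting file.
+ #
+ # video_file = download_youtube('https://www.youtube.com/watch?v=...', get_session_dir())
+ # if video_file:
+ #     audio_file = extract_audio(video_file)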
279
+
280
+ def cleanup_session_files(session_id):
281
+ """Removes the temporary directory for a given session."""
282
+ session_dir = os.path.join(TEMP_DIR, session_id)
283
+ if os.path.exists(session_dir):
284
+ import shutil
285
+ try:
286
+ shutil.rmtree(session_dir)
287
+ print(f"Cleaned up temporary files for session: {session_id}")
288
+ except Exception as e:
289
+ print(f"Error cleaning up session files {session_dir}: {e}")
290
+
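+ # Usage sketch: typically called when a new upload starts or the app is reset.
+ #
+ # cleanup_session_files(get_session_id())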
291
+
292
+
293
+
294
+ ###
295
+ ###=== Audio Loading and Processing
296
+ ###
297
+
298
+
299
+ SAMPLE_RATE = 22050
300
+ DURATION = 5
301
+
302
+ n_mfcc = 13 # number of MFCCs to extract from each sample
303
+ n_mels = 128
304
+
305
+ n_fft = 2048
306
+ hop_length = 512
307
+
308
+ delta_width = 9 # MFCC Delta parameter
309
+
310
+
311
+ def trim_silence(sound, s_thresh=-28.0):
312
+ '''Trims silent chunks from beginning and end of the sound'''
313
+ duration = len(sound)
314
+
315
+ start_trim = detect_leading_silence(sound, s_thresh)
316
+ end_trim = detect_leading_silence(sound.reverse(), s_thresh)
317
+
318
+ start = start_trim if start_trim != duration else None
319
+ end = duration - end_trim if end_trim != duration else None
320
+
321
+ return sound[start:end]
322
+
323
+
324
+ def normalize_volume(sound, target_dBFS=-20.0):
325
+ '''Normalizes sound and shifts to specified loudness'''
326
+ sound = sound.normalize()
327
+ difference = target_dBFS - sound.dBFS
328
+ return sound.apply_gain(difference)
329
+
330
+
331
+ def proc_raw_audio(audio_data, from_start=0, duration=None, before_end=0):
332
+ '''Processes raw audio data and returns WAV bytes and a NumPy signal array'''
333
+ # Instantiate a pydub AudioSegment object from the raw audio
334
+ audioObj = AudioSegment.from_file(BytesIO(audio_data))
335
+
336
+ # Convert to mono mode with the desired sample rate
337
+ audioObj = audioObj.set_frame_rate(SAMPLE_RATE).set_channels(1)
338
+ # Normalize audio volume
339
+ audioObj = normalize_volume(audioObj)
340
+ # Trim by removing silence from beginning and end of the sound
341
+ audioObj = trim_silence(audioObj)
342
+
343
+ # Cut to the desired duration
344
+ start = from_start * 1000
345
+ if duration:
346
+ end = start + duration * 1000
347
+ else:
348
+ end = len(audioObj) - before_end * 1000
349
+ audioObj = audioObj[start:end]
350
+
351
+ # Convert AudioSegment to wav format instance
352
+ buf = BytesIO()
353
+ audioObj.export(buf, format='wav')
354
+ audio_wav = buf.getvalue()
355
+
356
+ # Convert the AudioSegment to signal in form of numpy.array
357
+ arr = audioObj.get_array_of_samples()
358
+ audio_np = np.array(arr, dtype='float')
359
+
360
+ # Normalize if specified
361
+ # if normalized:
362
+ # audio_np = np.array(arr) / np.iinfo(arr.typecode).max
363
+ # y /= np.linalg.norm(y)
364
+ # return y, sample_rate
365
+
366
+ return audio_wav, audio_np
367
+
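+ # Usage sketch (file name and the 30 s window are illustrative): normalize/trim raw bytes
+ # and get back both a playable WAV and a numpy signal.
+ #
+ # with open(audio_path, 'rb') as f:
+ #     raw_bytes = f.read()
+ # wav_bytes, signal = proc_raw_audio(raw_bytes, from_start=60, duration=30)
+ # st.audio(wav_bytes, format='audio/wav')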
368
+
369
+ ###==============================================
370
+
371
+
372
+ def obtain_features(y, sr=22050, duration=5, delta_width=9):
373
+ '''Extracts sound features from given signal and returns them as a numpy array'''
374
+ # --- MFCC (returns M: np.ndarray [shape=(n_mfcc, t)])
375
+ mfcc = librosa.feature.mfcc(y=y, sr=sr,  # keyword args (positional y/sr are rejected by newer librosa)
376
+ n_mfcc=n_mfcc, n_mels=n_mels,
377
+ n_fft=n_fft, hop_length=hop_length)
378
+
379
+ return mfcc
380
+
381
+
382
+ def create_features_array(mfcc):#, mfcc_delta1, mfcc_delta2, spectr_c, spectr_r):
383
+ '''Creates a combined numpy array of means and variances from the given features'''
384
+ make_meanvar = lambda mean, var: [item for mv in zip(mean, var) for item in mv]
385
+
386
+ mean_var_ops = [
387
+ (mfcc.mean(axis=1), mfcc.var(axis=1))
388
+ ]
389
+
390
+ mfcc_meanvars = sum([make_meanvar(mean, var)
391
+ for mean, var in mean_var_ops], [])
392
+
393
+ # features_array = mfcc_meanvars + spectr_meanvars
394
+ features_array = [mfcc_meanvars]
395
+
396
+ return features_array
397
+
398
+ # def get_features(y, sr=22050, duration=5, delta_width=9):
399
+ # '''Returns numpy array of sound features obtained from signal'''
400
+ # return create_features_array(*obtain_features(y, sr, duration, delta_width))
401
+
402
+
403
+ def get_features(y, duration=5, sr=SAMPLE_RATE):
404
+ '''Returns numpy array of sound features obtained from signal'''
405
+
406
+ fig, axes = plt.subplots(1, 2, figsize=(24, 2))
407
+
408
+ # WAVE PLOT
409
+ axes[0].set_title(f'Wave Plot for audio sample at {sr} Hz')
410
+ axes[0].set_facecolor('#B4E8CF')
411
+ lbd.waveshow(y, sr=sr, color='#4300FF', ax=axes[0])
412
+
413
+ # MELSPEC
414
+ melspec = librosa.feature.melspectrogram(y=y, sr=sr)
415
+ melspec = librosa.power_to_db(np.abs(melspec), ref=np.max)
416
+ axes[1].set_title(f'Mel Spectrogram | shape: {melspec.shape}')
417
+ lbd.specshow(melspec, cmap='viridis', y_axis='mel', x_axis='time', ax=axes[1])
418
+
419
+ st.pyplot(fig)
420
+
421
+ pad_signal = lambda s, v: np.pad(
422
+ s,
423
+ [(0, 0), (0, max(0, 216 - s.shape[1]))],
424
+ constant_values=v
425
+ )
426
+
427
+ # Prepare melspec for use
428
+ melspec = pad_signal(melspec, melspec.min())
429
+ melspec = melspec.reshape(1, *melspec.shape)
430
+
431
+ # MFCC
432
+ # mfcc = create_features_array(obtain_features(y, sr, duration, delta_width))
433
+ # mfcc = np.array(mfcc).reshape(1, -1)
434
+
435
+ return melspec
436
+ # return mfcc
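+ # Usage sketch (names are illustrative): the processed signal becomes a padded mel-spectrogram
+ # batch of shape (1, n_mels, 216), suitable as input to a downstream audio classifier.
+ #
+ # _, signal = proc_raw_audio(raw_bytes, duration=DURATION)
+ # melspec_batch = get_features(signal)
+ # print(melspec_batch.shape)  # e.g. (1, 128, 216)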