Spaces:
Running
Running
| import streamlit as st | |
| import tempfile | |
| import os | |
| from pdf2markdown4llm import PDF2Markdown4LLM | |
# Page chrome. set_page_config is issued before any other Streamlit call on
# the page, followed by the visible heading and a one-line description.
st.set_page_config(
    page_title="PDF to Markdown Converter",
    layout="wide",
)
st.title("PDF to Markdown Converter")
st.write("Convert your PDF files to Markdown format")
def progress_callback(progress):
    """Render one conversion progress update in the Streamlit UI.

    Writes to the module-level ``progress_bar`` and ``status_text`` widgets,
    which the script creates right before it starts a conversion.

    Args:
        progress: Progress object handed over by the converter. This function
            reads its ``percentage`` (presumably on a 0-100 scale, since it is
            divided by 100 here), ``phase.value``, ``current_page``,
            ``total_pages`` and ``message`` attributes.
    """
    # st.progress rejects floats outside [0.0, 1.0]; clamp so a slightly
    # out-of-range report cannot abort the whole conversion with an exception.
    fraction = min(max(progress.percentage / 100, 0.0), 1.0)
    progress_bar.progress(fraction)
    status_text.text(
        f"Phase: {progress.phase.value}, Page {progress.current_page}/{progress.total_pages}\n"
        f"Progress: {progress.percentage:.1f}%, Message: {progress.message}"
    )
def format_markdown_for_preview(markdown_text: str) -> str:
    """Return *markdown_text* with a hard line break after every non-blank line.

    Markdown renderers join consecutive lines into one paragraph unless a line
    ends with two (or more) spaces, so two spaces are appended to each line
    that contains non-whitespace text. Blank and whitespace-only lines are
    left untouched so paragraph breaks are preserved as they are.

    Args:
        markdown_text: Markdown source, with lines separated by ``\\n``.

    Returns:
        The same text with ``"  "`` appended to every non-blank line.
    """
    # Exactly two trailing spaces: a single space is not a hard line break.
    hard_break = "  "
    return "\n".join(
        line + hard_break if line.strip() else line
        for line in markdown_text.split("\n")
    )
# File upload
uploaded_file = st.file_uploader("Select a PDF file", type=['pdf'])

if uploaded_file is not None:
    # Configuration options
    with st.expander("Conversion Settings"):
        remove_headers = st.checkbox("Remove Headers", value=False)
        skip_empty_tables = st.checkbox("Skip Empty Tables", value=True)
        table_header = st.text_input("Table Header Format", value="### Table")

    if st.button("Start Conversion"):
        # These two widgets are deliberately module-level names:
        # progress_callback() looks them up as globals while the converter runs.
        progress_bar = st.progress(0)
        status_text = st.empty()

        try:
            tmp_file_path = None
            try:
                # The converter is given a filesystem path, so the upload is
                # spilled to a named temporary file. delete=False keeps the
                # file on disk after the handle is closed.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                    # Record the path first so the cleanup below still runs
                    # if writing the upload fails part-way.
                    tmp_file_path = tmp_file.name
                    tmp_file.write(uploaded_file.getvalue())

                # Initialize converter
                converter = PDF2Markdown4LLM(
                    remove_headers=remove_headers,
                    skip_empty_tables=skip_empty_tables,
                    table_header=table_header,
                    progress_callback=progress_callback
                )

                # Perform conversion
                markdown_content = converter.convert(tmp_file_path)
            finally:
                # Remove the temporary file on success AND on failure, so a
                # failed conversion does not leave uploads behind on disk.
                if tmp_file_path is not None:
                    os.unlink(tmp_file_path)

            # Display results
            st.success("Conversion completed successfully!")

            # Raw markdown download
            st.download_button(
                label="Download Markdown File",
                data=markdown_content,
                file_name="converted.md",
                mime="text/markdown"
            )

            # Preview with proper formatting
            st.subheader("Preview")

            # Create tabs for different preview modes
            preview_tab, raw_tab = st.tabs(["Formatted Preview", "Raw Markdown"])

            with preview_tab:
                formatted_content = format_markdown_for_preview(markdown_content)
                st.markdown(formatted_content)

            with raw_tab:
                st.code(markdown_content, language="markdown")

        except Exception as e:
            # Top-level UI boundary: report the failure to the user instead of
            # letting the script crash with a traceback.
            st.error(f"An error occurred: {e}")
            # Both widgets are always created before the try block, so they
            # can be cleared unconditionally.
            progress_bar.empty()
            status_text.empty()