import re

import streamlit as st

def recursive_splitter(data):
    """Split text into paragraphs, then each paragraph into sentences.

    Paragraphs are separated by a blank line (two consecutive newlines) and
    sentences are delimited by a period.

    Args:
        data: The text to split.

    Returns:
        A flat list of stripped, non-empty sentences in document order. A
        sentence ends with a period only if the input had one there; no
        punctuation is invented for a trailing fragment.
    """
    sentences = []
    for paragraph in data.split('\n\n'):
        # Capture each run of non-period text together with the period that
        # follows it (if any), so the original punctuation is preserved.
        for match in re.finditer(r'([^.]+)(\.?)', paragraph):
            text = match.group(1).strip()
            if text:
                sentences.append(text + match.group(2))
    return sentences


def html_splitter(data):
    """Split a string into HTML tags and the text found between them.

    Args:
        data: The markup to split.

    Returns:
        A list of tags (anything of the form ``<...>``) and text segments in
        document order. Whitespace-only segments are dropped; the remaining
        segments are returned as-is, without stripping.
    """
    # The capturing group makes re.split keep the tags in the result.
    parts = re.split(r'(<[^>]+>)', data)
    return [part for part in parts if part.strip()]


def markdown_splitter(data):
    """Split Markdown text into heading lines and the content between them.

    A heading is a line starting with one to six ``#`` characters followed
    by a space.

    Args:
        data: The Markdown text to split.

    Returns:
        A list of stripped, non-empty chunks in document order, where each
        heading line is its own chunk.
    """
    # The capturing group makes re.split keep the heading lines themselves.
    parts = re.split(r'(^#{1,6} .*$)', data, flags=re.MULTILINE)
    return [part.strip() for part in parts if part.strip()]


def code_splitter(data):
    """Split Python source into chunks at top-level definitions.

    A new chunk starts at every line that begins (at column 0) with
    ``def``, ``async def`` or ``class``. Indented definitions such as
    methods stay inside the chunk of their enclosing block. Any text before
    the first definition is returned as its own leading chunk.

    Args:
        data: The source code to split.

    Returns:
        A list of stripped, non-empty chunks in source order.
    """
    boundary = re.compile(r'^(?:async[ \t]+def|def|class)[ \t]', re.MULTILINE)
    cuts = [match.start() for match in boundary.finditer(data)]
    # Make sure the text preceding the first definition is kept as a chunk.
    if not cuts or cuts[0] != 0:
        cuts.insert(0, 0)
    cuts.append(len(data))
    chunks = (data[start:end].strip() for start, end in zip(cuts, cuts[1:]))
    return [chunk for chunk in chunks if chunk]


def token_splitter(data):
    """Split text into word tokens.

    A token is a maximal run of word characters (letters, digits and
    underscore); punctuation and whitespace are discarded.

    Args:
        data: The text to tokenize.

    Returns:
        A list of tokens in the order they appear in ``data``.
    """
    return re.findall(r'\b\w+\b', data)


def character_splitter(data):
    """Split text into its individual characters.

    Args:
        data: The text to split.

    Returns:
        A list with one element per character of ``data``, whitespace
        included, in order.
    """
    return list(data)


def semantic_chunker(data):
    """Split text into sentence-sized chunks.

    The text is cut at whitespace that directly follows a sentence
    terminator (``.``, ``!`` or ``?``); the terminator stays attached to its
    sentence.

    Args:
        data: The text to split.

    Returns:
        A list of stripped, non-empty sentences in document order.
    """
    # The look-behind keeps the terminator with the sentence it ends.
    sentences = re.split(r'(?<=[.!?])\s+', data)
    return [sentence.strip() for sentence in sentences if sentence.strip()]


# Registry of the available splitters, keyed by the name shown in the UI.
# Each entry holds the callable that performs the split ("function") and the
# help text displayed to the user ("description").
splitter_details = {
    "Recursive Splitter": {
        "function": recursive_splitter,
        "description": "Recursively splits the data into smaller chunks, like paragraphs into sentences. Useful for processing text at different levels of granularity."
    },
    "HTML Splitter": {
        "function": html_splitter,
        "description": "Splits data based on HTML tags, making it easier to work with structured web content, such as isolating specific sections of HTML code."
    },
    "Markdown Splitter": {
        "function": markdown_splitter,
        "description": "Splits markdown content based on headings (e.g., '# ', '## '). Useful for processing documents written in Markdown format."
    },
    "Code Splitter": {
        "function": code_splitter,
        "description": "Splits programming code into logical blocks like functions or classes. Useful for code analysis and documentation."
    },
    "Token Splitter": {
        "function": token_splitter,
        "description": "Splits data into individual tokens/words, which is often the first step in natural language processing (NLP) tasks."
    },
    "Character Splitter": {
        "function": character_splitter,
        "description": "Splits text into individual characters. Useful for character-level analysis or encoding tasks."
    },
    "Semantic Chunker": {
        "function": semantic_chunker,
        "description": "Splits data based on semantic meaning, typically by sentences. Ensures that related information stays together."
    },
}


# --- Sidebar: input text, splitter choice and display options ---------------
st.sidebar.title("Splitter Settings")
st.sidebar.subheader("Data Input")
user_data = st.sidebar.text_area(
    "Enter the data you want to split:",
    "This is a sample text. Enter your data here...",
)

st.sidebar.subheader("Splitter Type")
splitter_type = st.sidebar.selectbox(
    "Choose a splitter type:",
    list(splitter_details.keys()),
)

st.sidebar.subheader("Options")
show_info = st.sidebar.checkbox("Show information about all splitter types")

# --- Main page header --------------------------------------------------------
st.title("RAG Splitter System")
st.markdown('<p class="title">Developed By: Irfan Ullah Khan</p>', unsafe_allow_html=True)

st.subheader(f"Selected Splitter: {splitter_type}")
st.write(splitter_details[splitter_type]["description"])

# --- Run the selected splitter when the user asks for it ---------------------
if st.button("Split Data"):
    with st.spinner('Processing data...'):
        splitter_function = splitter_details[splitter_type]["function"]
        split_output = splitter_function(user_data)

    # split_output only exists once the button has been pressed, so the
    # rendering must stay inside this branch.
    if split_output:
        st.subheader(f"Output using {splitter_type}")
        for idx, part in enumerate(split_output, start=1):
            st.write(f"**Part {idx}:**")
            # Show each chunk verbatim: st.write would render it as Markdown,
            # so chunks such as "# Heading", "*" or a lone space would be
            # interpreted instead of displayed.
            st.text(part)
    else:
        st.warning("The selected splitter produced no output for the given data.")

# --- Optional reference section listing every splitter -----------------------
if show_info:
    for name, details in splitter_details.items():
        st.subheader(name)
        st.write(details["description"])