TensorVizion committed on
Commit
4b61207
·
verified ·
1 Parent(s): 3409101

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +101 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import re
4
+ from huggingface_hub import InferenceClient
5
+
6
# Hub repository ID of the model used for extraction.
# Edit this to target a different model repo (for example "tensorvizion/O-wen-4.6").
MODEL_ID = "tensorvizion/O-wen-4.6"

# Single module-level inference client, shared by every request.
# NOTE: in a private Space, token=os.environ["HF_TOKEN"] would have to be
# passed here as well.
client = InferenceClient(model=MODEL_ID)
13
+
14
def extract_data(raw_text, fields_to_extract):
    """Ask the model to pull the requested fields out of unstructured text.

    Args:
        raw_text: The messy source text (notes, emails, OCR output, ...).
        fields_to_extract: Free-form description of the fields to extract.

    Returns:
        The value parsed from the model's JSON reply on success. On failure,
        a dict with an ``"error"`` message; when the model replied with
        something that is not valid JSON the dict also carries the reply
        under ``"raw_output"``. This function does not raise: every
        exception is converted into an ``"error"`` dict for the UI.
    """
    # Treat None like an empty string so a missing value is reported as a
    # validation error instead of raising AttributeError on .strip().
    if not (raw_text or "").strip() or not (fields_to_extract or "").strip():
        return {"error": "Please provide both raw text and fields to extract."}

    # Construct the system instruction for O-wen 4.6
    system_prompt = (
        "You are an expert data extraction assistant. Your job is to extract specific "
        "information from messy, unstructured text and output it as clean, valid JSON.\n"
        "Rules:\n"
        "1. Only extract the fields requested.\n"
        "2. If a field is not found in the text, return 'null' for that field.\n"
        "3. Output ONLY a raw JSON object. Do not include markdown formatting, backticks, or conversational text."
    )

    user_prompt = f"Fields to extract:\n{fields_to_extract}\n\nUnstructured Text:\n{raw_text}"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Bound before the try block so the JSONDecodeError handler can always
    # read it, even if the client call itself raises that exception (for
    # example while decoding an HTTP response) before any reply is stored.
    output_text = ""

    try:
        # Call O-wen 4.6 via the chat completion API
        response = client.chat_completion(
            messages=messages,
            max_tokens=1024,
            temperature=0.1,  # Low temperature for more deterministic/factual data extraction
        )

        # The message content may be None; fall back to "" so the failure is
        # reported through the invalid-JSON path below.
        output_text = (response.choices[0].message.content or "").strip()

        # Fallback: models sometimes still wrap the JSON in a markdown code
        # fence, with or without a "json" language tag. Keep only the fenced
        # payload so json.loads() does not choke on the backticks.
        fence_match = re.search(
            r"```(?:json)?\s*(.*?)\s*```",
            output_text,
            re.DOTALL | re.IGNORECASE,
        )
        if fence_match:
            output_text = fence_match.group(1)

        # Parse the text into an actual JSON value for the Gradio UI
        return json.loads(output_text)

    except json.JSONDecodeError:
        return {
            "error": "The model failed to return valid JSON. It returned this instead:",
            "raw_output": output_text,
        }
    except Exception as e:
        # Top-level UI boundary: surface any other failure as a message
        # rather than letting the click handler crash.
        return {"error": str(e)}
63
+
64
# ----------------------------------------------------------------
# Gradio interface: inputs on the left, parsed JSON on the right
# ----------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header
    gr.Markdown("# 🛟 The Data Rescuer")
    gr.Markdown(f"**Powered by `{MODEL_ID}`** | Turn messy transcripts, notes, and OCR text into clean JSON data.")

    with gr.Row():
        # Input side: source text, requested fields, and the trigger button
        with gr.Column():
            raw_input = gr.Textbox(
                lines=12,
                label="1. Paste Unstructured Text",
                placeholder="Paste your messy meeting notes, emails, or raw text here...",
            )
            schema_input = gr.Textbox(
                lines=3,
                label="2. What fields do you want to extract?",
                placeholder="e.g., Company Name, Contact Person, Deadline, Action Items (list)",
            )
            extract_btn = gr.Button(value="Extract Structured Data", variant="primary")

        # Output side: whatever extract_data returns, rendered as JSON
        with gr.Column():
            json_output = gr.JSON(label="Structured Output")

    # Clicking the button runs extract_data on both textboxes and shows the
    # result in the JSON panel.
    extract_btn.click(
        outputs=json_output,
        inputs=[raw_input, schema_input],
        fn=extract_data,
    )

# Start the web server only when the file is executed as a script
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio
2
+ huggingface_hub