| import gradio as gr |
|
|
| from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings, OpenAIEmbeddings |
| from langchain.vectorstores import Pinecone |
| import pinecone |
| import os |
# Set the HuggingFace tokenizers parallelism flag before any model is loaded.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# --- Pinecone connection settings (each overridable via environment) -------
PINECONE_KEY = os.environ.get("PINECONE_KEY", "")
PINECONE_ENV = os.environ.get("PINECONE_ENV", "asia-northeast1-gcp")
PINECONE_INDEX = os.environ.get("PINECONE_INDEX", "3gpp-r16")


# --- Embedding settings -----------------------------------------------------
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "hkunlp/instructor-large")
EMBEDDING_LOADER = os.environ.get(
    "EMBEDDING_LOADER",
    "HuggingFaceInstructEmbeddings",
)
# Loader names offered in the "Embedding Loader" dropdown.
EMBEDDING_LIST = [
    "HuggingFaceInstructEmbeddings",
    "HuggingFaceEmbeddings",
]


# --- Search defaults used by the sliders in the UI --------------------------
TOP_K_DEFAULT = 15
TOP_K_MAX = 30
SCORE_DEFAULT = 0.33


# Module-level handle to the vector store; stays None until init_db() runs.
g_db = None
|
|
def init_db(emb_name, emb_loader, db_api_key, db_env, db_index):
    """Open an existing Pinecone index and store it in the module-level ``g_db``.

    Args:
        emb_name: Model identifier passed to the embedding class as
            ``model_name``.
        emb_loader: Name of the embedding class to instantiate. Must be one
            of the embedding classes imported by this module.
        db_api_key: Pinecone API key.
        db_env: Pinecone environment name.
        db_index: Name of the existing Pinecone index to open.

    Returns:
        ``str(g_db)`` for the newly created vector store.

    Raises:
        ValueError: If ``emb_loader`` is not a known embedding class name.
    """
    global g_db

    # SECURITY: the loader name comes from the web UI, so it is resolved
    # through an explicit allow-list. The previous eval(emb_loader) would
    # execute any expression a client chose to send.
    known_loaders = {
        "HuggingFaceInstructEmbeddings": HuggingFaceInstructEmbeddings,
        "HuggingFaceEmbeddings": HuggingFaceEmbeddings,
        "OpenAIEmbeddings": OpenAIEmbeddings,
    }
    try:
        loader_cls = known_loaders[emb_loader]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown embedding loader {emb_loader!r}; "
            f"expected one of {sorted(known_loaders)}"
        ) from None

    embeddings = loader_cls(model_name=emb_name)

    pinecone.init(api_key=db_api_key, environment=db_env)

    g_db = Pinecone.from_existing_index(
        index_name=db_index,
        embedding=embeddings,
    )
    return str(g_db)
|
|
|
|
def get_db():
    """Return the current value of the module-level ``g_db`` handle."""
    current_db = g_db
    return current_db
|
|
|
|
def remove_duplicates(documents, score_min):
    """Filter scored documents by minimum score and drop repeated content.

    Args:
        documents: Iterable of ``(doc, score)`` pairs, where ``doc`` exposes
            a ``page_content`` attribute.
        score_min: Lowest score a pair may have and still be kept.

    Returns:
        A list of the ``doc`` objects (scores dropped), in input order,
        keeping only the first accepted document for each distinct
        ``page_content``.
    """
    kept = []
    accepted_texts = set()
    for candidate, candidate_score in documents:
        text = candidate.page_content
        # Skip repeats of already-accepted content and low-scoring hits.
        # A rejected low-scoring hit does NOT mark its content as seen.
        if text in accepted_texts or not candidate_score >= score_min:
            continue
        accepted_texts.add(text)
        kept.append(candidate)
    return kept
|
|
|
|
def get_data(query, top_k, score):
    """Run a similarity search against ``g_db`` and return de-duplicated hits.

    Args:
        query: Free-text search string from the UI.
        top_k: Maximum number of hits to request from the vector store.
        score: Minimum score a hit must reach to be returned.

    Returns:
        A list of matching documents, or a short message string when the
        database has not been initialised or the query is empty.
    """
    # The "init db" message belongs to the missing-database case; previously
    # it was tied to an empty query, and a search before initialisation
    # crashed on ``None.similarity_search_with_score``.
    if g_db is None:
        return "Please init db in configuration"

    if not (query or "").strip():
        return "Please enter a query"

    print("Use db: " + str(g_db))

    # The slider value may arrive as a float depending on the UI version
    # (assumption, not shown here); the store is asked for a whole count.
    docs = g_db.similarity_search_with_score(query=query, k=int(top_k))

    return remove_duplicates(docs, score)
|
|
# Gradio UI: a "Matching" tab for running queries and a "Configuration" tab
# for selecting the embedding model and the Pinecone connection.
# NOTE(review): the container nesting below was reconstructed from a source
# whose indentation had been lost; confirm the layout matches the intent.
with gr.Blocks(
    title="3GPP Database",
    theme="Base",
    css=""".bigbox {
    min-height:250px;
}
""",
) as demo:
    with gr.Tab("Matching"):
        with gr.Accordion("Vector similarity"):
            with gr.Row():
                with gr.Column():
                    top_k = gr.Slider(
                        1,
                        TOP_K_MAX,
                        value=TOP_K_DEFAULT,
                        step=1,
                        label="Vector similarity top_k",
                        interactive=True,
                    )
                with gr.Column():
                    score = gr.Slider(
                        0.01,
                        0.99,
                        value=SCORE_DEFAULT,
                        step=0.01,
                        label="Vector similarity score",
                        interactive=True,
                    )

        with gr.Row():
            inp = gr.Textbox(
                label="Input",
                placeholder="What are you looking for?",
            )
            out = gr.Textbox(label="Output")

        btn_run = gr.Button("Run", variant="primary")

    with gr.Tab("Configuration"):
        with gr.Row():
            # The initial value is the callable get_db rather than a literal.
            loading = gr.Textbox(get_db, max_lines=1, show_label=False)
            btn_init = gr.Button("Init")

        with gr.Accordion("Embedding"):
            with gr.Row():
                with gr.Column():
                    # A model repo id is plain text, not an e-mail address.
                    emb_textbox = gr.Textbox(
                        label="Embedding Model",
                        value=EMBEDDING_MODEL,
                        placeholder="Paste Your Embedding Model Repo on HuggingFace",
                        lines=1,
                        interactive=True,
                        type='text',
                    )

                with gr.Column():
                    emb_dropdown = gr.Dropdown(
                        EMBEDDING_LIST,
                        value=EMBEDDING_LOADER,
                        multiselect=False,
                        interactive=True,
                        label="Embedding Loader",
                    )

        with gr.Accordion("Pinecone Database"):
            with gr.Row():
                # The API key is a secret, so it stays a password field.
                db_api_textbox = gr.Textbox(
                    label="Pinecone API Key",
                    value=PINECONE_KEY,
                    placeholder="Paste Your Pinecone API Key (xx-xx-xx-xx-xx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type='password',
                )
            with gr.Row():
                # Environment and index names are plain text (were 'email').
                db_env_textbox = gr.Textbox(
                    label="Pinecone Environment",
                    value=PINECONE_ENV,
                    placeholder="Paste Your Pinecone Environment (xx-xx-xx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type='text',
                )
                db_index_textbox = gr.Textbox(
                    label="Pinecone Index",
                    value=PINECONE_INDEX,
                    placeholder="Paste Your Pinecone Index (xxxx) and Hit ENTER",
                    lines=1,
                    interactive=True,
                    type='text',
                )

    # "Init" connects to Pinecone and shows the resulting store in `loading`;
    # "Run" sends the query and slider values to get_data and shows the hits.
    btn_init.click(
        fn=init_db,
        inputs=[emb_textbox, emb_dropdown, db_api_textbox, db_env_textbox, db_index_textbox],
        outputs=loading,
    )
    btn_run.click(fn=get_data, inputs=[inp, top_k, score], outputs=out)
|
|
if __name__ == "__main__":
    # Enable the request queue, then launch the app and open it in a browser.
    demo.queue().launch(inbrowser=True)
|
|