| | import os |
| | import shutil |
| | import git |
| | from urllib.parse import urlparse |
| |
|
# Base directory for the "staging" checkouts: the working directory the
# process was started in.
local_dir = os.getcwd()

# HEAD reference of the most recently cloned repository; assigned by
# clone_repo() and passed to the loader in load_repo().
branch = None
| |
|
| | |
def get_repo_name(url):
    """Return the repository name encoded in a git remote URL.

    The name is the last path component of the URL, with a trailing
    ``.git`` suffix removed when (and only when) it is present.

    Args:
        url: Remote URL of the repository, e.g.
            ``https://github.com/user/repo.git``.

    Returns:
        The repository name as a string, e.g. ``"repo"``.
    """
    parsed_url = urlparse(url)

    # Drop trailing slashes first so ".../repo/" still yields "repo"
    # instead of an empty basename.
    repo_name = os.path.basename(parsed_url.path.rstrip("/"))

    # Strip the suffix only when it is really there: slicing off four
    # characters unconditionally mangles URLs that do not end in ".git".
    if repo_name.endswith(".git"):
        repo_name = repo_name[:-4]
    return repo_name
| |
|
| | |
def clone_repo(url):
    """Clone ``url`` into ``<local_dir>/staging/<repository name>``.

    On a successful clone the module-level ``branch`` is set to the HEAD
    reference of the new checkout.

    Args:
        url: Remote URL of the repository to clone.

    Returns:
        True when a fresh clone was created; False when the target
        directory already exists or any step raised an exception.
    """
    global branch
    try:
        repo_name = get_repo_name(url)
        target = os.path.join(local_dir, "staging", repo_name)

        # An existing staging directory short-circuits the clone.
        if os.path.exists(target):
            print(f"{repo_name} already added in db")
            return False

        cloned = git.Repo.clone_from(url, target)
        branch = cloned.head.reference
        print(f"{repo_name} cloned succesfully")
    except Exception as e:
        print(f"Error cloning the git repository: {e}")
        return False
    return True
| |
|
def delete_cloned_repo(url):
    """Remove the staging checkout of the repository at ``url``.

    Removal is best-effort: individual files that cannot be deleted are
    skipped. The outcome is then verified so the printed message reflects
    what actually happened.

    Args:
        url: Remote URL of the repository whose staging directory
            (``<local_dir>/staging/<repository name>``) should be removed.

    Returns:
        None. The result is reported on stdout only.
    """
    local_path = os.path.join(local_dir, "staging", get_repo_name(url))
    try:
        if not os.path.exists(local_path):
            print(f"Repository at {local_path} does not exist.")
            return

        # ignore_errors keeps rmtree going past entries it cannot delete.
        shutil.rmtree(local_path, ignore_errors=True)

        # rmtree swallowed any failure above, so check the result rather
        # than reporting success unconditionally.
        if os.path.exists(local_path):
            print(
                f"Error deleting repository: {local_path} "
                "could not be fully removed."
            )
        else:
            print(f"Repository at {local_path} successfully deleted.")
    except Exception as e:
        print(f"Error deleting repository: {e}")
| |
|
| | from langchain_community.document_loaders import GitLoader |
| | from langchain.text_splitter import RecursiveCharacterTextSplitter |
| | from langchain_community.vectorstores import Qdrant |
| | import qdrant_client |
| |
|
# Shared splitter applied to loaded documents in load_repo()
# (chunk_size=1000, chunk_overlap=20).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
| |
|
| | |
| | |
| |
|
# Qdrant client configured from the environment: QDRANT_HOST is passed as
# the first positional argument and QDRANT_API_KEY as the API key. A
# variable that is not set is passed through as None.
client = qdrant_client.QdrantClient(
    os.environ.get("QDRANT_HOST"),
    api_key=os.environ.get("QDRANT_API_KEY"),
)
| |
|
| | from langchain_community.embeddings.fastembed import FastEmbedEmbeddings |
# Embedding model shared by every vector store created in this module.
embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5")

# Module-level placeholder. NOTE(review): load_repo() binds its own local
# store and does not rebind this name - confirm whether other modules
# expect it to be populated.
vectorstore = None
| |
|
def load_repo(url):
    """Index the staged checkout of ``url`` into a Qdrant collection.

    Recreates a collection named after the repository, loads the files of
    ``<local_dir>/staging/<repository name>`` (skipping paths that end in
    ``package-lock.json``), splits them with ``text_splitter`` and adds the
    resulting chunks to the collection.

    Args:
        url: Remote URL of a repository that has already been cloned into
            the staging directory.

    Returns:
        True when the chunks were added; False when loading, splitting or
        adding raised. Exceptions raised while setting up the collection
        are not caught here and propagate to the caller.
    """
    models = qdrant_client.http.models
    # Vector size 384 with cosine distance.
    # NOTE(review): presumably matches the output size of `embeddings` - confirm.
    vector_params = models.VectorParams(
        size=384,
        distance=models.Distance.COSINE,
    )

    repo_name = get_repo_name(url)
    client.recreate_collection(
        collection_name=repo_name,
        vectors_config=vector_params,
    )
    store = Qdrant(
        client=client,
        collection_name=repo_name,
        embeddings=embeddings,
    )
    print("collection created")

    def keep_file(file_path):
        # Filter out npm lock files.
        return not file_path.endswith("package-lock.json")

    try:
        checkout = os.path.join(local_dir, "staging", repo_name)
        documents = GitLoader(
            repo_path=checkout,
            branch=branch,
            file_filter=keep_file,
        ).load()
        chunks = text_splitter.split_documents(documents)
        print("chunks created")
        store.add_documents(chunks)
    except Exception as e:
        print(f"Error loading and indexing repository: {e}")
        return False
    return True
| | |
def repository_loader(url):
    """Clone the repository at ``url``, index it, then remove the clone.

    The staging checkout is removed whenever the clone succeeded, even if
    indexing fails or raises. Otherwise a leftover directory would make
    every later ``clone_repo`` call for the same URL take its
    "already exists" branch and prevent a retry.

    Args:
        url: Remote URL of the repository to process.

    Returns:
        None.
    """
    if not clone_repo(url):
        return
    try:
        load_repo(url)
    finally:
        # Always clean up the checkout created by clone_repo above.
        delete_cloned_repo(url)
| |
|
| |
|
| |
|
# Marker written to stdout when this module's top level is executed.
print("HELLO FROM CONTAINER")
| | |
| |
|
| | |