ChromaDB Client
commit ed0107e99c

@@ -0,0 +1,277 @@
import chromadb
from chromadb.config import Settings
import requests
import html2text
import logging
import re
import urllib.request
import os

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
#from langchain.vectorstores.chroma import Chroma

logging.basicConfig(filename='chroma.log', level=logging.DEBUG)

# constants
COLLECTION_NAME = "janet_knowledge"

POSTS_URL = "https://api.d4science.org/rest/2/posts/get-posts-vre/"
GCAT_URL = "https://api.d4science.org/catalogue/items/"
PDF_PATH = "./PDF"
TXT_PATH = "./TXT"

# the gcube-token should ideally be loaded from configuration rather than source
headers = {"gcube-token": "2c1e8f88-461c-42c0-8cc1-b7660771c9a3-843339462", "Accept": "application/json"}

# registry of downloaded documents, keyed by id (see add_doc)
docs = {}

"""
|
||||
Starting the DB:
|
||||
docker pull chromadb/chroma
|
||||
docker run -p 8000:8000 -e CHROMA_SERVER_AUTHN_CREDENTIALS_PROVIDER="chromadb.authn.token.TokenConfigServerAuthCredentialsProvider" -e CHROMA_SERVER_AUTHN_PROVIDER="chromadb.authn.token.TokenAuthenticationServerProvider" -e CHROMA_SERVER_AUTHN_CREDENTIALS="myToken" -e CHROMA_SERVER_AUTHN_TOKEN_TRANSPORT_HEADER="X_CHROMA_TOKEN" -v /Users/ahmed/Desktop/chroma/:/chroma/chroma chromadb/chroma
|
||||
"""
|
||||
|
||||
client = chromadb.HttpClient(
    host="localhost",
    port=8000,
    # the header name must match CHROMA_SERVER_AUTHN_TOKEN_TRANSPORT_HEADER above:
    # X_CHROMA_TOKEN means the token travels in the X-Chroma-Token header
    headers={"X-Chroma-Token": "myToken"}
    #settings=Settings(chroma_client_auth_provider="token", chroma_client_auth_credentials="myToken"),
)
collection = client.get_or_create_collection(name=COLLECTION_NAME)
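
# Quick connectivity check (a sketch): heartbeat() returns a server timestamp
# and raises on connection or authentication failure, so a bad host, port, or
# token surfaces here instead of mid-ingestion.
logging.debug(f"chroma heartbeat: {client.heartbeat()}")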


def getFilename_fromCd(cd):
    """Extract a filename from a Content-Disposition header value."""
    if not cd:
        return None
    fname = re.findall('filename=(.+)', cd)
    if len(fname) == 0:
        return None
    return fname[0]
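
# For illustration (hypothetical header values): the capture group keeps
# everything after "filename=", so quotes or trailing parameters are not stripped.
#   getFilename_fromCd('attachment; filename=report.pdf')   ->  'report.pdf'
#   getFilename_fromCd('attachment; filename="report.pdf"') ->  '"report.pdf"'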


def load_documents():
    document_loader = PyPDFDirectoryLoader(PDF_PATH)
    logging.debug("loading docs")
    return document_loader.load()


def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    logging.debug(f"splitting {len(documents)} documents")
    return text_splitter.split_documents(documents)
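
# With length_function=len, chunk_size and chunk_overlap count characters: each
# chunk holds at most 800 characters and consecutive chunks share up to 80, so
# text cut at a boundary still appears whole in one of the two chunks.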


def calculate_chunk_ids(chunks):

    # This will create IDs like "data/monopoly.pdf:6:2"
    # Page Source : Page Number : Chunk Index

    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"
        logging.debug(f"chunking {source} page {page}")

        # If the page ID is the same as the last one, increment the index.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID.
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the page meta-data.
        chunk.metadata["id"] = chunk_id
        #logging.debug(docs[source])
        #chunk.metadata["extras"] = docs["./"+source]['metadata'] #needs primitive types

    logging.debug(f"we have {len(chunks)} chunks")
    return chunks
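
# Illustration with made-up documents: two chunks from the same page get
# indices 0 and 1, and the counter resets when the page changes, e.g.
#   demo = [Document(page_content="a", metadata={"source": "x.pdf", "page": 1}),
#           Document(page_content="b", metadata={"source": "x.pdf", "page": 1}),
#           Document(page_content="c", metadata={"source": "x.pdf", "page": 2})]
#   [d.metadata["id"] for d in calculate_chunk_ids(demo)]
#   -> ['x.pdf:1:0', 'x.pdf:1:1', 'x.pdf:2:0']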


def add_to_chroma(chunks):
    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Add or Update the documents.
    existing_items = collection.get(include=[])  # IDs are always included by default
    existing_ids = set(existing_items["ids"])
    logging.debug(f"Number of existing documents in DB: {len(existing_ids)}")

    # Only add documents that don't exist in the DB.
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    if new_chunks:
        logging.debug(f"Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        chunk_metas = [chunk.metadata for chunk in new_chunks]
        chunk_content = [chunk.page_content for chunk in new_chunks]
        logging.debug(f"chunks {len(new_chunks)}")
        logging.debug(f"chunk_id: {len(new_chunk_ids)}")
        collection.add(documents=chunk_content,
                       metadatas=chunk_metas,
                       ids=new_chunk_ids)
    else:
        logging.debug("No new documents to add")


def get_posts(url, headers):
    h = html2text.HTML2Text()
    h.ignore_links = True
    try:
        posts = requests.get(url, headers=headers)
        posts = posts.json()['result']
        for post in posts:
            author = post['full_name'].lower()
            key = post['key']
            content = h.handle(post['description']).replace('\n', ' ').lower()
            date = post['time']
            tags = []
            for word in content.split():
                if word.startswith('#'):
                    tags.append(word[1:])
            logging.debug(f"""Adding post {key} whose author is {author} and content is {content} to the index.""")
            filename = "./TXT/post-" + key + ".txt"
            with open(filename, 'w') as f:
                f.write(content)
            #add_doc(filename, {'source': f"""post-{key}""", 'author': author, 'date': date, 'tags': tags}, key)
    except Exception as e:
        logging.error(e)


def get_catalogue(url, headers):
    try:
        response = requests.get(url, headers=headers)
        items = response.json()
        items_data = []
        for item in items:
            api_url = url + item + '/'
            logging.debug(f"Getting resources of {api_url}")
            response = requests.get(api_url, headers=headers)
            try:
                items_data.append(response.json())
            except Exception as err:
                logging.error(err)
                logging.error(f"response is {response}")

        for item in items_data:
            logging.debug(item)
            # 'extras' may lack these keys, so reset them for every item
            rsrc = None
            item_url = None
            for el in item['extras']:
                if el['key'] == 'system:type':
                    rsrc = el['value']
                if el['key'] == 'Item URL':
                    item_url = el['value']
            tags = []
            for tag in item['tags']:
                tags.append(tag['name'].lower())
            title = item['title'].lower()
            author = item['author'].lower()
            notes = item['notes'].lower()
            date = item['metadata_created']
            resources = []
            for resource in item['resources']:
                logging.debug(resource)
                resources.append(
                    {'name': resource['name'].lower(), 'url': resource['url'], 'description': resource['description'].lower()})
                if rsrc == 'Paper':
                    r = requests.get(resource['url'], headers=headers)
                    filename = ""
                    if 'application/pdf' in r.headers.get('content-type', ''):
                        filename = './PDF/' + item['title'] + "-" + getFilename_fromCd(r.headers.get('content-disposition')) + ".pdf"
                        urllib.request.urlretrieve(resource['url'], filename)
                    if 'text/plain' in r.headers.get('content-type', ''):
                        filename = './TXT/' + item['title'] + "-" + getFilename_fromCd(r.headers.get('content-disposition')) + ".txt"
                        urllib.request.urlretrieve(resource['url'], filename)
                    add_doc(filename, {'source': f"paper-{title}", 'author': author, 'date': date, 'tags': tags}, filename)
                    filename = "./TXT/paper-" + title + "-desc.txt"
                    with open(filename, 'w') as f:
                        f.write(resource['description'])
                    add_doc(filename, {'source': f"paper-{title}", 'author': author, 'date': date, 'tags': tags}, filename)
                if rsrc == 'Dataset':
                    filename = "./TXT/dataset-" + title + ".txt"
                    with open(filename, 'w') as f:
                        f.write(resource['description'])
                    add_doc(filename, {'source': f"dataset-{title}", 'author': author, 'date': date, 'tags': tags}, filename)

    except Exception as e:
        logging.error(e)


def clear_dir():
    try:
        for directory in [PDF_PATH, TXT_PATH]:
            files = os.listdir(directory)
            for file in files:
                file_path = os.path.join(directory, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
        logging.debug("All files deleted successfully.")
    except Exception as e:
        logging.error("Error occurred while deleting files.")
        logging.error(e)


def create_dir(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)


def add_doc(filename, metadata, id):
    docs[id] = {'filename': filename, 'metadata': metadata}
    logging.debug(f"checking {docs[id]}")


def main():
    #clear_dir()
    #create_dir("./PDF/")
    #create_dir("./TXT")
    #get_posts(POSTS_URL, headers=headers)
    #get_catalogue(GCAT_URL, headers=headers)

    #documents = load_documents()
    #chunks = split_documents(documents)
    #add_to_chroma(chunks)

    results = collection.query(
        query_texts=["who is leonardo candela?"],
        n_results=3
    )
    print(results)
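    # query() returns a dict of parallel lists, one inner list per query text:
    # results["ids"][0], results["documents"][0], results["metadatas"][0] and
    # results["distances"][0] hold the n_results nearest chunks for this query.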


if __name__ == "__main__":
    main()
    #clear_dir()

#collection.add(
#    documents=["This is a document about cat", "This is a document about car"],
#    metadatas=[{"category": "animal"}, {"category": "vehicle"}],
#    ids=["id1", "id2"]
#)

#results = collection.query(
#    query_texts=["vehicle"],
#    n_results=1
#)
#print(results)


# sketch of the intended ingestion loop:
#def main():
#    get stuff from the VRE
#    add if not there
#    while True:
#        receive notifications from the VRE and add new additions

#if __name__ == "__main__":
#    main()