trial

2023-04-21 04:08:30 +02:00 · 2023-04-21 04:08:30 +02:00 · 019281a217
parent 4a28e90975
commit 019281a217
4 changed files with 62 additions and 60 deletions
--- a/6
+++ b/6
@@ -2,12 +2,12 @@ FROM python:3.8.10-slim

 WORKDIR /backend_janet 

-COPY requirements_simple.txt .
+COPY requirements_main.txt .

-RUN pip install -r requirements_simple.txt
+RUN pip install -r requirements_main.txt

 RUN rm -fr /root/.cache/*

 COPY . .

-ENTRYPOINT ["python", "main_simple.py"]
+ENTRYPOINT ["python", "main.py"]
--- a/ResponseGenerator.py
+++ b/ResponseGenerator.py
@@ -5,6 +5,7 @@ from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 import pandas as pd
 from datetime import datetime
+from datasets import Dataset

 class ResponseGenerator:
    def __init__(self, index, db,recommender,generators, retriever, num_retrieved=3):
@@ -81,11 +82,17 @@ class ResponseGenerator:
            return []
  
    def _search_index(self, index_type, db_type, query, multi=False):
-        xq = self.retriever.encode([query])
-        D, I = self.index[index_type].search(xq, self.num_retrieved)
+        self.index[index_type].add_faiss_index(column="embeddings")
+        scores, samples = self.index[index_type].get_nearest_examples(
+            "embeddings", retriever.encode([query]), k=3
+        )
+        samples_df = pd.DataFrame.from_dict(samples)
+        samples_df["scores"] = scores
+        samples_df.sort_values("scores", ascending=False, inplace=True)
+
        if multi:
-            return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True)
-        return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True).loc[0]
+            return samples_df.reset_index(drop=True)
+        return samples_df.iloc[0].reset_index(drop=True)


    def gen_response(self, action, utterance=None, name=None, username=None, vrename=None, state=None, consec_history=None, chitchat_history=None):
--- a/VRE.py
+++ b/VRE.py
@@ -14,7 +14,7 @@ import urllib.request
 import time
 import threading
 import html2text
-
+from datasets import Dataset

 class VRE:
    def __init__(self, name, token, retriever, directory='/app/'):
@@ -35,12 +35,12 @@ class VRE:
                   'dataset_db': pd.read_json(self.directory + self.name + '_dataset.json') if os.path.isfile(self.directory + self.name + '_dataset.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created', 'url']), 
                   'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content']),
                   'post_db': pd.read_json(self.directory + self.name + '_post.json') if os.path.isfile(self.directory + self.name + '_post.json') else pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])}
-        self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else faiss.read_index(self.directory + 'janet_dataset_titles_index'),
-                  'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else faiss.read_index(self.directory + 'janet_paper_titles_index'),
-                  'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else faiss.read_index(self.directory + 'janet_dataset_desc_index'),
-                  'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else faiss.read_index(self.directory + 'janet_paper_desc_index'),
-                  'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index(self.directory + 'janet_content_index'),
-                  'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else faiss.read_index(self.directory + 'janet_post_index')}
+        self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else Dataset.load_from_disk(self.directory + 'janet_dataset_titles_index'),
+                  'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else Dataset.load_from_disk(self.directory + 'janet_paper_titles_index'),
+                  'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else Dataset.load_from_disk(self.directory + 'janet_dataset_desc_index'),
+                  'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else Dataset.load_from_disk(self.directory + 'janet_paper_desc_index'),
+                  'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else Dataset.load_from_disk(self.directory + 'janet_content_index'),
+                  'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else Dataset.load_from_disk(self.directory + 'janet_post_index')}
        self.new_income = False

    def init(self):
@@ -48,69 +48,59 @@ class VRE:
        self.get_content()
        if self.index['dataset_titles_index'] is None:
            self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
-            self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
+            #self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
                
        if self.index['dataset_desc_index'] is None:
            self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
-            self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
+            #self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
            
        if self.index['paper_titles_index'] is None:
            self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
-            self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
+            #self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
            
        if self.index['paper_desc_index'] is None:
            self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
-            self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
+            #self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
            
        if self.index['content_index'] is None:
            self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
-            self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
+            #self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
          
        if self.index['post_index'] is None:
            self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
-            self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
+            #self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')

    def index_periodic_update(self):
        if self.new_income:         
            if len(self.db['content_db'])%100 != 0:
                self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
-            self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
+            #self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
            if len(self.db['post_db'])%100 != 0:
                self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
-            self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
+            #self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
            if len(self.db['paper_db'])%100 != 0:
                self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
                self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
-            self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
-            self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
+            #self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
+            #self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
            if len(self.db['dataset_db'])%100 != 0:
                self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
                self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
-            self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')    
-            self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
+            #self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')    
+            #self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
            self.new_income = False

    def create_index(self, db_type, attribute, index_type, filename):
        filename = self.directory + filename
        to_index = self.db[db_type][attribute]
-        for i, info in enumerate(to_index):
-            if i == 0:
-                emb = self.retriever.encode([info])
-                sentence_embeddings = np.array(emb)
-            else:
-                emb = self.retriever.encode([info])
-                sentence_embeddings = np.append(sentence_embeddings, emb, axis=0)

-        # number of partitions of the coarse quantizer = number of posting lists
-        # as rule of thumb, 4*sqrt(N) < nlist < 16*sqrt(N), where N is the size of the database
-        nlist = int(4 * math.sqrt(len(sentence_embeddings))) if int(4 * math.sqrt(len(sentence_embeddings))) < len(sentence_embeddings) else len(sentence_embeddings)-1
-        code_size = 8  # = number of subquantizers = number of sub-vectors
-        n_bits = 4 if len(sentence_embeddings) >= 2**4 else int(math.log2(len(sentence_embeddings)))  # n_bits of each code (8 -> 1 byte codes)
-        d = sentence_embeddings.shape[1]
-        coarse_quantizer = faiss.IndexFlatL2(d)  # will keep centroids of coarse quantizer (for inverted list)
-        self.index[index_type] = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits)
-        self.index[index_type].train(sentence_embeddings)  # train on a random subset to speed up k-means (NOTE: ensure they are randomly chosen!)
-        faiss.write_index(self.index[index_type], filename)
+        dataset = Dataset.from_pandas(self.db[db_type])
+        embeddings_dataset = dataset.map(
+            lambda x: {"embeddings": self.retriever.encode([x[attribute]])[0]}
+        )
+        embeddings_dataset.save_to_disk(filename)
+        self.index[index_type] = embeddings_dataset
+        #faiss.write_index(self.index[index_type], filename)
    
    def populate_index(self, db_type, attribute, index_type, filename):
        filename = self.directory + filename
@@ -185,15 +175,15 @@ class VRE:
               self.dataset_counter += 1
               dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date, url]

-        self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True).reset_index(drop=True)
-        self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True).reset_index(drop=True)
-        self.db['post_db'] = post_df.sort_values(by='time', ascending=True).reset_index(drop=True)
+        self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True)
+        self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True)
+        self.db['post_db'] = post_df.sort_values(by='time', ascending=True)
        
-        other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
+        #other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
        for i, post in post_df.iterrows():
-            if post['author'] != "Catalogue":
+            if post['author'] != "catalogue":
                self.content_counter+=1
-                other_content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
+                content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
        """
        for i, description in dataset_df.iterrows():
            self.content_counter+=1
@@ -202,7 +192,7 @@ class VRE:
            self.content_counter+=1
            other_content_df.loc[str(self.content_counter)] = [self.content_counter, -3, description['title'] + ' is a paper. ' + description['notes'] + ' It is about ' + ', '.join(description['tags']) ]
        """    
-        self.db['content_db'] = pd.concat([content_df, other_content_df]).reset_index(drop=True)
+        self.db['content_db'] = content_df
        self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
        self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')

@@ -282,17 +272,17 @@ class VRE:
                self.dataset_counter += 1
                dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date, url]

-        self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)]).reset_index(drop=True)
-        self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)]).reset_index(drop=True)
+        self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)])
+        self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)])
        
-        self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)]).reset_index(drop=True)
+        
+        self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)])
        self.db['post_db'].to_json(self.directory+self.name+'_post.json')
        
-        other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
        for i, post in post_df.iterrows():
-            if post['author'] != "Catalogue":
+            if post['author'] != "catalogue":
                self.content_counter+=1
-                other_content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
+                content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
        """
        for i, description in dataset_df.iterrows():
            self.content_counter+=1
@@ -304,7 +294,7 @@ class VRE:

        self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
        self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
-        self.db['content_db'] = pd.concat([self.db['content_db'], content_df, other_content_df]).reset_index(drop=True)
+        self.db['content_db'] = pd.concat([self.db['content_db'], content_df])
        self.db['content_db'].to_json(self.directory + self.name + '_content.json')
        if not paper_df.empty or not dataset_df.empty or not content_df.empty or not post_df.empty:
            self.new_income = True            
--- a/main.py
+++ b/main.py
@@ -46,11 +46,16 @@ def vre_fetch():
        rg.update_index(vre.get_index())
        rg.update_db(vre.get_db())

+
 def user_interest_decay(token):
    while True:
-        print("decaying interests after 3 minutes for " + users[token]['username'])
-        time.sleep(180)
-        users[token]['user'].decay_interests()
+        if token in users:
+            print("decaying interests after 3 minutes for " + users[token]['username'])
+            time.sleep(180)
+            users[token]['user'].decay_interests()
+        else:
+             break
+

 def clear_inactive():
    while True: