diff --git a/Dockerfile b/Dockerfile index c0a0bb9..64f24c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,12 +2,12 @@ FROM python:3.8.10-slim WORKDIR /backend_janet -COPY requirements_simple.txt . +COPY requirements_main.txt . -RUN pip install -r requirements_simple.txt +RUN pip install -r requirements_main.txt RUN rm -fr /root/.cache/* COPY . . -ENTRYPOINT ["python", "main_simple.py"] +ENTRYPOINT ["python", "main.py"] diff --git a/ResponseGenerator.py b/ResponseGenerator.py index 6c2da8c..452c423 100644 --- a/ResponseGenerator.py +++ b/ResponseGenerator.py @@ -5,6 +5,7 @@ from sklearn.metrics.pairwise import cosine_similarity import numpy as np import pandas as pd from datetime import datetime +from datasets import Dataset class ResponseGenerator: def __init__(self, index, db,recommender,generators, retriever, num_retrieved=3): @@ -81,11 +82,17 @@ class ResponseGenerator: return [] def _search_index(self, index_type, db_type, query, multi=False): - xq = self.retriever.encode([query]) - D, I = self.index[index_type].search(xq, self.num_retrieved) + self.index[index_type].add_faiss_index(column="embeddings") + scores, samples = self.index[index_type].get_nearest_examples( + "embeddings", retriever.encode([query]), k=3 + ) + samples_df = pd.DataFrame.from_dict(samples) + samples_df["scores"] = scores + samples_df.sort_values("scores", ascending=False, inplace=True) + if multi: - return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True) - return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True).loc[0] + return samples_df.reset_index(drop=True) + return samples_df.iloc[0].reset_index(drop=True) def gen_response(self, action, utterance=None, name=None, username=None, vrename=None, state=None, consec_history=None, chitchat_history=None): diff --git a/VRE.py b/VRE.py index 7272f30..3d14990 100644 --- a/VRE.py +++ b/VRE.py @@ -14,7 +14,7 @@ import urllib.request import time import threading import html2text - +from datasets import Dataset class VRE: def __init__(self, name, token, retriever, directory='/app/'): @@ -35,12 +35,12 @@ class VRE: 'dataset_db': pd.read_json(self.directory + self.name + '_dataset.json') if os.path.isfile(self.directory + self.name + '_dataset.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created', 'url']), 'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content']), 'post_db': pd.read_json(self.directory + self.name + '_post.json') if os.path.isfile(self.directory + self.name + '_post.json') else pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])} - self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else faiss.read_index(self.directory + 'janet_dataset_titles_index'), - 'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else faiss.read_index(self.directory + 'janet_paper_titles_index'), - 'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else faiss.read_index(self.directory + 'janet_dataset_desc_index'), - 'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else faiss.read_index(self.directory + 'janet_paper_desc_index'), - 'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index(self.directory + 'janet_content_index'), - 'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else faiss.read_index(self.directory + 'janet_post_index')} + self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else Dataset.load_from_disk(self.directory + 'janet_dataset_titles_index'), + 'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else Dataset.load_from_disk(self.directory + 'janet_paper_titles_index'), + 'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else Dataset.load_from_disk(self.directory + 'janet_dataset_desc_index'), + 'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else Dataset.load_from_disk(self.directory + 'janet_paper_desc_index'), + 'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else Dataset.load_from_disk(self.directory + 'janet_content_index'), + 'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else Dataset.load_from_disk(self.directory + 'janet_post_index')} self.new_income = False def init(self): @@ -48,69 +48,59 @@ class VRE: self.get_content() if self.index['dataset_titles_index'] is None: self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index') - self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index') + #self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index') if self.index['dataset_desc_index'] is None: self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index') - self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index') + #self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index') if self.index['paper_titles_index'] is None: self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index') - self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index') + #self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index') if self.index['paper_desc_index'] is None: self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index') - self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index') + #self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index') if self.index['content_index'] is None: self.create_index('content_db', 'content', 'content_index', 'janet_content_index') - self.populate_index('content_db', 'content', 'content_index', 'janet_content_index') + #self.populate_index('content_db', 'content', 'content_index', 'janet_content_index') if self.index['post_index'] is None: self.create_index('post_db', 'content', 'post_index', 'janet_post_index') - self.populate_index('post_db', 'content', 'post_index', 'janet_post_index') + #self.populate_index('post_db', 'content', 'post_index', 'janet_post_index') def index_periodic_update(self): if self.new_income: if len(self.db['content_db'])%100 != 0: self.create_index('content_db', 'content', 'content_index', 'janet_content_index') - self.populate_index('content_db', 'content', 'content_index', 'janet_content_index') + #self.populate_index('content_db', 'content', 'content_index', 'janet_content_index') if len(self.db['post_db'])%100 != 0: self.create_index('post_db', 'content', 'post_index', 'janet_post_index') - self.populate_index('post_db', 'content', 'post_index', 'janet_post_index') + #self.populate_index('post_db', 'content', 'post_index', 'janet_post_index') if len(self.db['paper_db'])%100 != 0: self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index') self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index') - self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index') - self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index') + #self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index') + #self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index') if len(self.db['dataset_db'])%100 != 0: self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index') self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index') - self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index') - self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index') + #self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index') + #self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index') self.new_income = False def create_index(self, db_type, attribute, index_type, filename): filename = self.directory + filename to_index = self.db[db_type][attribute] - for i, info in enumerate(to_index): - if i == 0: - emb = self.retriever.encode([info]) - sentence_embeddings = np.array(emb) - else: - emb = self.retriever.encode([info]) - sentence_embeddings = np.append(sentence_embeddings, emb, axis=0) - # number of partitions of the coarse quantizer = number of posting lists - # as rule of thumb, 4*sqrt(N) < nlist < 16*sqrt(N), where N is the size of the database - nlist = int(4 * math.sqrt(len(sentence_embeddings))) if int(4 * math.sqrt(len(sentence_embeddings))) < len(sentence_embeddings) else len(sentence_embeddings)-1 - code_size = 8 # = number of subquantizers = number of sub-vectors - n_bits = 4 if len(sentence_embeddings) >= 2**4 else int(math.log2(len(sentence_embeddings))) # n_bits of each code (8 -> 1 byte codes) - d = sentence_embeddings.shape[1] - coarse_quantizer = faiss.IndexFlatL2(d) # will keep centroids of coarse quantizer (for inverted list) - self.index[index_type] = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits) - self.index[index_type].train(sentence_embeddings) # train on a random subset to speed up k-means (NOTE: ensure they are randomly chosen!) - faiss.write_index(self.index[index_type], filename) + dataset = Dataset.from_pandas(self.db[db_type]) + embeddings_dataset = dataset.map( + lambda x: {"embeddings": self.retriever.encode([x[attribute]])[0]} + ) + embeddings_dataset.save_to_disk(filename) + self.index[index_type] = embeddings_dataset + #faiss.write_index(self.index[index_type], filename) def populate_index(self, db_type, attribute, index_type, filename): filename = self.directory + filename @@ -185,15 +175,15 @@ class VRE: self.dataset_counter += 1 dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date, url] - self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True).reset_index(drop=True) - self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True).reset_index(drop=True) - self.db['post_db'] = post_df.sort_values(by='time', ascending=True).reset_index(drop=True) + self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True) + self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True) + self.db['post_db'] = post_df.sort_values(by='time', ascending=True) - other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content']) + #other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content']) for i, post in post_df.iterrows(): - if post['author'] != "Catalogue": + if post['author'] != "catalogue": self.content_counter+=1 - other_content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])] + content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])] """ for i, description in dataset_df.iterrows(): self.content_counter+=1 @@ -202,7 +192,7 @@ class VRE: self.content_counter+=1 other_content_df.loc[str(self.content_counter)] = [self.content_counter, -3, description['title'] + ' is a paper. ' + description['notes'] + ' It is about ' + ', '.join(description['tags']) ] """ - self.db['content_db'] = pd.concat([content_df, other_content_df]).reset_index(drop=True) + self.db['content_db'] = content_df self.db['paper_db'].to_json(self.directory + self.name + '_paper.json') self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json') @@ -282,17 +272,17 @@ class VRE: self.dataset_counter += 1 dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date, url] - self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)]).reset_index(drop=True) - self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)]).reset_index(drop=True) + self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)]) + self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)]) - self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)]).reset_index(drop=True) + + self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)]) self.db['post_db'].to_json(self.directory+self.name+'_post.json') - other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content']) for i, post in post_df.iterrows(): - if post['author'] != "Catalogue": + if post['author'] != "catalogue": self.content_counter+=1 - other_content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])] + content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])] """ for i, description in dataset_df.iterrows(): self.content_counter+=1 @@ -304,7 +294,7 @@ class VRE: self.db['paper_db'].to_json(self.directory + self.name + '_paper.json') self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json') - self.db['content_db'] = pd.concat([self.db['content_db'], content_df, other_content_df]).reset_index(drop=True) + self.db['content_db'] = pd.concat([self.db['content_db'], content_df]) self.db['content_db'].to_json(self.directory + self.name + '_content.json') if not paper_df.empty or not dataset_df.empty or not content_df.empty or not post_df.empty: self.new_income = True diff --git a/main.py b/main.py index 7d1a9c6..cfe307c 100644 --- a/main.py +++ b/main.py @@ -46,11 +46,16 @@ def vre_fetch(): rg.update_index(vre.get_index()) rg.update_db(vre.get_db()) + def user_interest_decay(token): while True: - print("decaying interests after 3 minutes for " + users[token]['username']) - time.sleep(180) - users[token]['user'].decay_interests() + if token in users: + print("decaying interests after 3 minutes for " + users[token]['username']) + time.sleep(180) + users[token]['user'].decay_interests() + else: + break + def clear_inactive(): while True: