This commit is contained in:
ahmed531998 2023-04-21 04:08:30 +02:00
parent 6456dca7d2
commit 0615194c7f
4 changed files with 62 additions and 60 deletions


@@ -2,12 +2,12 @@ FROM python:3.8.10-slim
 WORKDIR /backend_janet
-COPY requirements_simple.txt .
-RUN pip install -r requirements_simple.txt
+COPY requirements_main.txt .
+RUN pip install -r requirements_main.txt
 RUN rm -fr /root/.cache/*
 COPY . .
-ENTRYPOINT ["python", "main_simple.py"]
+ENTRYPOINT ["python", "main.py"]


@@ -5,6 +5,7 @@ from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 import pandas as pd
 from datetime import datetime
+from datasets import Dataset

 class ResponseGenerator:
     def __init__(self, index, db,recommender,generators, retriever, num_retrieved=3):
@@ -81,11 +82,17 @@ class ResponseGenerator:
             return []

     def _search_index(self, index_type, db_type, query, multi=False):
-        xq = self.retriever.encode([query])
-        D, I = self.index[index_type].search(xq, self.num_retrieved)
+        self.index[index_type].add_faiss_index(column="embeddings")
+        scores, samples = self.index[index_type].get_nearest_examples(
+            "embeddings", self.retriever.encode([query]), k=3
+        )
+        samples_df = pd.DataFrame.from_dict(samples)
+        samples_df["scores"] = scores
+        samples_df.sort_values("scores", ascending=False, inplace=True)
         if multi:
-            return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True)
-        return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True).loc[0]
+            return samples_df.reset_index(drop=True)
+        return samples_df.iloc[0].reset_index(drop=True)

     def gen_response(self, action, utterance=None, name=None, username=None, vrename=None, state=None, consec_history=None, chitchat_history=None):
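Note: the rewritten _search_index rebuilds the FAISS index on every call via add_faiss_index. A minimal standalone sketch of the same datasets retrieval flow, assuming a sentence-transformers encoder (the model name and sample rows are illustrative, not from the commit):

import pandas as pd
from datasets import Dataset
from sentence_transformers import SentenceTransformer

retriever = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model choice

# Build a toy corpus, embed each row, and index the embeddings column.
df = pd.DataFrame({"title": ["Coral reef dataset", "Tuna stock paper", "Plankton survey"]})
ds = Dataset.from_pandas(df)
ds = ds.map(lambda x: {"embeddings": retriever.encode([x["title"]])[0]})
ds.add_faiss_index(column="embeddings")

# get_nearest_examples returns parallel lists: FAISS scores and the matching rows.
scores, samples = ds.get_nearest_examples("embeddings", retriever.encode(["fish stocks"]), k=2)
results = pd.DataFrame.from_dict(samples)
results["scores"] = scores
# NOTE: add_faiss_index defaults to a flat L2 index, where smaller scores mean
# closer matches, so this sketch sorts ascending (the commit sorts descending).
print(results.sort_values("scores"))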

VRE.py

@@ -14,7 +14,7 @@ import urllib.request
 import time
 import threading
 import html2text
+from datasets import Dataset

 class VRE:
     def __init__(self, name, token, retriever, directory='/app/'):
@@ -35,12 +35,12 @@ class VRE:
                    'dataset_db': pd.read_json(self.directory + self.name + '_dataset.json') if os.path.isfile(self.directory + self.name + '_dataset.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created', 'url']),
                    'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content']),
                    'post_db': pd.read_json(self.directory + self.name + '_post.json') if os.path.isfile(self.directory + self.name + '_post.json') else pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])}
-        self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else faiss.read_index(self.directory + 'janet_dataset_titles_index'),
-                      'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else faiss.read_index(self.directory + 'janet_paper_titles_index'),
-                      'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else faiss.read_index(self.directory + 'janet_dataset_desc_index'),
-                      'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else faiss.read_index(self.directory + 'janet_paper_desc_index'),
-                      'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index(self.directory + 'janet_content_index'),
-                      'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else faiss.read_index(self.directory + 'janet_post_index')}
+        self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else Dataset.load_from_disk(self.directory + 'janet_dataset_titles_index'),
+                      'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else Dataset.load_from_disk(self.directory + 'janet_paper_titles_index'),
+                      'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else Dataset.load_from_disk(self.directory + 'janet_dataset_desc_index'),
+                      'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else Dataset.load_from_disk(self.directory + 'janet_paper_desc_index'),
+                      'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else Dataset.load_from_disk(self.directory + 'janet_content_index'),
+                      'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else Dataset.load_from_disk(self.directory + 'janet_post_index')}

         self.new_income = False

     def init(self):
@@ -48,69 +48,59 @@ class VRE:
         self.get_content()
         if self.index['dataset_titles_index'] is None:
             self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
-            self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
+            #self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
         if self.index['dataset_desc_index'] is None:
             self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
-            self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
+            #self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
         if self.index['paper_titles_index'] is None:
             self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
-            self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
+            #self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
         if self.index['paper_desc_index'] is None:
             self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
-            self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
+            #self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
         if self.index['content_index'] is None:
             self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
-            self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
+            #self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
         if self.index['post_index'] is None:
             self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
-            self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
+            #self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')

     def index_periodic_update(self):
         if self.new_income:
             if len(self.db['content_db'])%100 != 0:
                 self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
-                self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
+                #self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
             if len(self.db['post_db'])%100 != 0:
                 self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
-                self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
+                #self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
             if len(self.db['paper_db'])%100 != 0:
                 self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
                 self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
-                self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
-                self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
+                #self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
+                #self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
             if len(self.db['dataset_db'])%100 != 0:
                 self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
                 self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
-                self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
-                self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
+                #self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
+                #self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
             self.new_income = False

     def create_index(self, db_type, attribute, index_type, filename):
         filename = self.directory + filename
         to_index = self.db[db_type][attribute]
-        for i, info in enumerate(to_index):
-            if i == 0:
-                emb = self.retriever.encode([info])
-                sentence_embeddings = np.array(emb)
-            else:
-                emb = self.retriever.encode([info])
-                sentence_embeddings = np.append(sentence_embeddings, emb, axis=0)
-        # number of partitions of the coarse quantizer = number of posting lists
-        # as rule of thumb, 4*sqrt(N) < nlist < 16*sqrt(N), where N is the size of the database
-        nlist = int(4 * math.sqrt(len(sentence_embeddings))) if int(4 * math.sqrt(len(sentence_embeddings))) < len(sentence_embeddings) else len(sentence_embeddings)-1
-        code_size = 8  # = number of subquantizers = number of sub-vectors
-        n_bits = 4 if len(sentence_embeddings) >= 2**4 else int(math.log2(len(sentence_embeddings)))  # n_bits of each code (8 -> 1 byte codes)
-        d = sentence_embeddings.shape[1]
-        coarse_quantizer = faiss.IndexFlatL2(d)  # will keep centroids of coarse quantizer (for inverted list)
-        self.index[index_type] = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits)
-        self.index[index_type].train(sentence_embeddings)  # train on a random subset to speed up k-means (NOTE: ensure they are randomly chosen!)
-        faiss.write_index(self.index[index_type], filename)
+        dataset = Dataset.from_pandas(self.db[db_type])
+        embeddings_dataset = dataset.map(
+            lambda x: {"embeddings": self.retriever.encode([x[attribute]])[0]}
+        )
+        embeddings_dataset.save_to_disk(filename)
+        self.index[index_type] = embeddings_dataset
+        #faiss.write_index(self.index[index_type], filename)

     def populate_index(self, db_type, attribute, index_type, filename):
         filename = self.directory + filename
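For reference, a sketch of the save/load round trip that create_index and __init__ now rely on (the path and the stand-in encoder are illustrative, not the committed code). Note that save_to_disk writes an Arrow dataset as a directory, so the os.path.isfile checks in __init__ above would need os.path.isdir to detect a previously saved index:

import pandas as pd
from datasets import Dataset, load_from_disk

db = pd.DataFrame({"notes": ["spatial data on catches", "aquaculture production"]})
emb = Dataset.from_pandas(db).map(lambda x: {"embeddings": [0.0, 1.0]})  # stand-in encoder

# save_to_disk persists the rows and embeddings column; a FAISS index attached
# with add_faiss_index is NOT saved and must be rebuilt after loading.
emb.save_to_disk("/tmp/janet_dataset_desc_index")

restored = load_from_disk("/tmp/janet_dataset_desc_index")
restored.add_faiss_index(column="embeddings")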
@@ -185,15 +175,15 @@ class VRE:
                 self.dataset_counter += 1
                 dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date, url]
-        self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True).reset_index(drop=True)
-        self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True).reset_index(drop=True)
-        self.db['post_db'] = post_df.sort_values(by='time', ascending=True).reset_index(drop=True)
-        other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
+        self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True)
+        self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True)
+        self.db['post_db'] = post_df.sort_values(by='time', ascending=True)
+        #other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
         for i, post in post_df.iterrows():
-            if post['author'] != "Catalogue":
+            if post['author'] != "catalogue":
                 self.content_counter+=1
-                other_content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
+                content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
         """
         for i, description in dataset_df.iterrows():
             self.content_counter+=1
@@ -202,7 +192,7 @@ class VRE:
             self.content_counter+=1
             other_content_df.loc[str(self.content_counter)] = [self.content_counter, -3, description['title'] + ' is a paper. ' + description['notes'] + ' It is about ' + ', '.join(description['tags'])]
         """
-        self.db['content_db'] = pd.concat([content_df, other_content_df]).reset_index(drop=True)
+        self.db['content_db'] = content_df
         self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
         self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
@@ -282,17 +272,17 @@ class VRE:
                 self.dataset_counter += 1
                 dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date, url]
-        self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)]).reset_index(drop=True)
-        self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)]).reset_index(drop=True)
-        self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)]).reset_index(drop=True)
+        self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)])
+        self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)])
+        self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)])
         self.db['post_db'].to_json(self.directory+self.name+'_post.json')
-        other_content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
         for i, post in post_df.iterrows():
-            if post['author'] != "Catalogue":
+            if post['author'] != "catalogue":
                 self.content_counter+=1
-                other_content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
+                content_df.loc[str(self.content_counter)] = [self.content_counter, -1, post['author'] + ' posted: ' + post['content'] + ' It is about ' + ', '.join(post['tags'])]
         """
         for i, description in dataset_df.iterrows():
             self.content_counter+=1
@@ -304,7 +294,7 @@ class VRE:
         self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
         self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
-        self.db['content_db'] = pd.concat([self.db['content_db'], content_df, other_content_df]).reset_index(drop=True)
+        self.db['content_db'] = pd.concat([self.db['content_db'], content_df])
         self.db['content_db'].to_json(self.directory + self.name + '_content.json')
         if not paper_df.empty or not dataset_df.empty or not content_df.empty or not post_df.empty:
             self.new_income = True
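A small pandas aside on what dropping .reset_index(drop=True) changes here: the batch frames are filled with string labels via .loc[str(counter)], so after a plain pd.concat those labels can repeat across batches (the data below is illustrative):

import pandas as pd

a = pd.DataFrame({'id': [0]}, index=['1'])
b = pd.DataFrame({'id': [1]}, index=['1'])

merged = pd.concat([a, b])
print(merged.loc['1'])                 # two rows now share the label '1'
print(merged.reset_index(drop=True))   # integer labels 0..n-1 again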

main.py

@@ -46,11 +46,16 @@ def vre_fetch():
         rg.update_index(vre.get_index())
         rg.update_db(vre.get_db())

 def user_interest_decay(token):
     while True:
-        print("decaying interests after 3 minutes for " + users[token]['username'])
-        time.sleep(180)
-        users[token]['user'].decay_interests()
+        if token in users:
+            print("decaying interests after 3 minutes for " + users[token]['username'])
+            time.sleep(180)
+            users[token]['user'].decay_interests()
+        else:
+            break

 def clear_inactive():
     while True:
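A self-contained sketch of the lifecycle this guard enables: one decay loop runs per login token and exits once the token disappears from users. The User class and the login/logout handlers are assumptions for illustration; the re-check after the sleep avoids a KeyError if the token is removed mid-sleep, which the committed version could still hit:

import threading
import time

class User:
    """Minimal stand-in for the app's user object."""
    def __init__(self, name):
        self.name = name
        self.interests = {"fisheries": 1.0}

    def decay_interests(self, factor=0.9):
        self.interests = {k: v * factor for k, v in self.interests.items()}

users = {}

def user_interest_decay(token):
    # Mirrors the guarded loop in the diff, with a re-check after the sleep.
    while token in users:
        time.sleep(180)
        entry = users.get(token)  # token may have been removed while sleeping
        if entry is None:
            break
        entry['user'].decay_interests()

def on_login(token, user):
    users[token] = {'user': user, 'username': user.name}
    # daemon=True so a forgotten thread cannot keep the process alive
    threading.Thread(target=user_interest_decay, args=(token,), daemon=True).start()

def on_logout(token):
    users.pop(token, None)  # the decay thread exits on its next check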