enhance_janet
parent: ca125bb3cd
commit: b006bc01ce

DM.py (4 changed lines)

@@ -41,6 +41,10 @@ class DM:
         else:
+            if self.curr_state['intent'] == 'QA':
+                return "RetGen"
+            if self.curr_state['intent'] == 'EXPLAINPOST':
+                return "findPost"
             if self.curr_state['intent'] == 'HELP':
                 return "getHelp"
             elif self.curr_state['intent'] == 'CHITCHAT':
                 return "ConvGen"
             elif self.curr_state['intent'] == 'FINDPAPER':
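
For orientation, the branch ladder above amounts to an intent-to-action table. A condensed sketch (hypothetical helper, not part of DM.py; the 'Clarify' fallback is an assumption based on the action names used elsewhere in this commit):

    # Hypothetical summary of the routing after this commit.
    INTENT_TO_ACTION = {
        'QA': 'RetGen',             # new: question answering over indexed content
        'EXPLAINPOST': 'findPost',  # new: look up VRE posts
        'HELP': 'getHelp',
        'CHITCHAT': 'ConvGen',
        'FINDPAPER': 'findPaper',
    }

    def next_action(curr_state):
        # Unknown intents fall through to clarification (assumption).
        return INTENT_TO_ACTION.get(curr_state['intent'], 'Clarify')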

NLU.py (6 changed lines)

@@ -87,7 +87,7 @@ class NLU:
         if score > 0.5:
             entities = self._entityextractor()
             offense = self._offensepredictor()
-            if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER'] and len(entities) == 0:
+            if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER', 'EXPLAINPOST'] and len(entities) == 0:
                 return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense, "is_clear": False}
             return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense, "is_clear": True}
         else:
@@ -97,7 +97,7 @@ class NLU:
             entities = self._entityextractor()
             offense = self._offensepredictor()
             if score > 0.5 or not self._ambigpredictor():
-                if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER'] and len(entities) == 0:
+                if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER', 'EXPLAINPOST'] and len(entities) == 0:
                     return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense, "is_clear": False}
                 return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense,
                         "is_clear": True}
@@ -107,6 +107,6 @@ class NLU:
         else:
             entities = self._entityextractor()
             offense = self._offensepredictor()
-            if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER'] and len(entities) == 0:
+            if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER', 'EXPLAINPOST'] and len(entities) == 0:
                 return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense, "is_clear": False}
             return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense, "is_clear": True}
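
All three hunks change the same gate: an intent that needs an entity (now including EXPLAINPOST) is marked unclear when the extractor returns nothing. Isolated as a sketch (the helper name is illustrative, not in NLU.py):

    # Intents that cannot be served without at least one entity (topic, author, title).
    ENTITY_REQUIRED = {'FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER', 'EXPLAINPOST'}

    def is_clear(intent, entities):
        return not (intent in ENTITY_REQUIRED and len(entities) == 0)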

@@ -7,7 +7,7 @@ import pandas as pd
 from datetime import datetime

 class ResponseGenerator:
-    def __init__(self, index, db, recommender, generators, retriever, num_retrieved=1):
+    def __init__(self, index, db, recommender, generators, retriever, num_retrieved=3):
         self.generators = generators
         self.retriever = retriever
         self.recommender = recommender
@@ -16,6 +16,7 @@ class ResponseGenerator:
         self.num_retrieved = num_retrieved
         self.paper = {}
         self.dataset = {}
+        self.post = {}

     def update_index(self, index):
         self.index = index
@@ -37,9 +38,18 @@ class ResponseGenerator:
         else:
             return {}

-    def _get_matching_authors(self, rsrc, author):
+    def _get_matching_authors(self, rsrc, author, recent=False):
         cand = self.db[rsrc].loc[self.db[rsrc]['author'] == author.lower()].reset_index(drop=True)
         if not cand.empty:
-            return cand.loc[0]
+            if recent:
+                index = 0
+                curr = 0
+                for i, row in cand.iterrows():
+                    if row['time'] > curr:
+                        index = i
+                        curr = row['time']
+                return cand.loc[index]
+            else:
+                return cand.loc[0]
         else:
             return {}
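
The recent=True branch is a linear arg-max over the 'time' column. Assuming 'time' holds numeric timestamps, the same row can be selected with a one-line pandas idiom (equivalent sketch, not part of the commit):

    import pandas as pd

    def most_recent(cand: pd.DataFrame) -> pd.Series:
        # Row with the largest 'time' value, same result as the loop above.
        return cand.loc[cand['time'].idxmax()]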
@@ -61,15 +71,17 @@ class ResponseGenerator:
         else:
             return []

-    def _search_index(self, index_type, db_type, query):
+    def _search_index(self, index_type, db_type, query, multi=False):
         xq = self.retriever.encode([query])
         D, I = self.index[index_type].search(xq, self.num_retrieved)
+        if multi:
+            return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True)
         return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True).loc[0]


     def gen_response(self, action, utterance=None, username=None, state=None, consec_history=None):
         if action == "Help":
-            return "Hey it's Janet! I am here to help you make use of the datasets and papers in the VRE. I can answer questions whose answers may be inside the papers. I can summarize papers for you. I can also chat with you. So, whichever it is, I am ready to chat!"
+            return "Hey it's Janet! I am here to help you make use of the datasets and papers in the catalogue. I can answer questions whose answers may be inside the papers. I can summarize papers for you. I can also chat with you. So, whichever it is, I am ready to chat!"
         elif action == "Recommend":
             prompt = self.recommender.make_recommendation(username)
             if prompt != "":
@@ -79,10 +91,28 @@ class ResponseGenerator:

         elif action == "OffenseReject":
             return "I am sorry, I cannot answer to this kind of language"

+        elif action == "getHelp":
+            return "I can answer questions related to the papers in the VRE's catalogue. I can also get you the posts, papers and datasets from the catalogue if you specify a topic or an author. I am also capable of small talk and summarizing papers to an extent. Just text me what you want and I will do it :)"
+
+        elif action == "findPost":
+            for entity in state['entities']:
+                if entity['entity'] == 'TOPIC':
+                    self.post = self._get_matching_topics('post_db', entity['value'])
+                    if len(self.post) > 0:
+                        return str("This is a relevant post: " + self.post['content'] + ' by ' + self.post['author'])
+                if entity['entity'] == 'AUTHOR':
+                    self.post = self._get_matching_authors('post_db', entity['value'], recent=True)
+                    if len(self.post) > 0:
+                        if len(self.post['tags']) > 0:
+                            return str("Here is the most recent post by: " + self.post['author'] + ', which is about ' + ', '.join(self.post['tags']) + ': ' + self.post['content'])
+                        else:
+                            return str("Here is the most recent post by: " + self.post['author'] + ', ' + self.post['content'])
+            return "I could not find the post you are looking for."
+
         elif action == "ConvGen":
             gen_kwargs = {"length_penalty": 2.5, "num_beams": 2, "max_length": 30, "repetition_penalty": 2.5, "temperature": 2}
-            answer = self.generators['chat']('history: ' + consec_history + ' ' + utterance + ' persona: ' + 'I am Janet. My name is Janet. I am an AI developed by CNR to help VRE users.', **gen_kwargs)[0]['generated_text']
+            answer = self.generators['chat']('question: ' + utterance + ' context: My name is Janet. I am an AI developed by CNR to help VRE users. ' + consec_history, **gen_kwargs)[0]['generated_text']
             return answer

         elif action == "findPaper":
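
The ConvGen rewrite swaps a persona-conditioned prompt ('history: ... persona: ...') for a QA-style one ('question: ... context: ...') on the same text-generation pipeline. A sketch of the call pattern, assuming a Hugging Face text2text pipeline (the model name is a placeholder, not the one Janet ships with):

    from transformers import pipeline

    chat = pipeline('text2text-generation', model='google/flan-t5-small')  # placeholder model
    gen_kwargs = {"length_penalty": 2.5, "num_beams": 2, "max_length": 30,
                  "repetition_penalty": 2.5, "temperature": 2}
    prompt = 'question: who are you? context: My name is Janet. I am an AI developed by CNR to help VRE users.'
    answer = chat(prompt, **gen_kwargs)[0]['generated_text']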
@@ -142,14 +172,21 @@ class ResponseGenerator:

         elif action == "RetGen":
             #retrieve the most relevant paragraphs
-            content = str(self._search_index('content_index', 'content_db', utterance)['content'])
+            content = self._search_index('content_index', 'content_db', utterance, multi=True)
+            evidence = ""
+            ev = ""
+            for i, row in content.iterrows():
+                evidence = evidence + str(i+1) + ") " + row['content'] + ' \n '
+                ev = ev + " " + row['content']

             #generate the answer
-            gen_seq = 'question: '+utterance+" context: "+content
+            gen_seq = 'question: '+utterance+" context: "+ev

             #handle return random 2 answers
             gen_kwargs = {"length_penalty": 0.5, "num_beams": 2, "max_length": 60, "repetition_penalty": 2.5, "temperature": 2}
             answer = self.generators['qa'](gen_seq, **gen_kwargs)[0]['generated_text']
-            return str(answer)
+            return "According to the following evidence: " + evidence + " \n ........" + "The answer is: " + answer

         elif action == "sumPaper":
             if len(self.paper) == 0:
@@ -165,8 +202,8 @@ class ResponseGenerator:
             answer = ""
             for i, row in df.iterrows():
                 gen_seq = 'summarize: '+row['content']
-                gen_kwargs = {"length_penalty": 1.5, "num_beams": 6, "max_length": 120, "repetition_penalty": 2.5, "temperature": 2}
-                answer = self.generators['summ'](gen_seq, **gen_kwargs)[0]['generated_text'] + ' '
+                gen_kwargs = {"length_penalty": 1.5, "num_beams": 6, "max_length": 30, "repetition_penalty": 2.5, "temperature": 2}
+                answer = answer + self.generators['summ'](gen_seq, **gen_kwargs)[0]['generated_text'] + ' '
             return answer
         elif action == "Clarify":
             if state['intent'] in ['FINDPAPER', 'SUMMARIZEPAPER'] and len(state['entities']) == 0:
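
The sumPaper fix changes `answer = ...` to `answer = answer + ...`, so the response now concatenates one short (max_length 30) summary per stored chunk instead of keeping only the last one. The core pattern, reduced to a sketch (df and summarizer are illustrative stand-ins for the real members):

    summaries = []
    for _, row in df.iterrows():                    # one chunk of the paper per row
        out = summarizer('summarize: ' + row['content'],
                         num_beams=6, max_length=30,
                         length_penalty=1.5, repetition_penalty=2.5)
        summaries.append(out[0]['generated_text'])
    answer = ' '.join(summaries)                    # whole-paper summary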
@@ -175,6 +212,8 @@ class ResponseGenerator:
             elif state['intent'] == 'FINDDATASET' and len(state['entities']) == 0:
                 if len(self.dataset) == 0:
                     return 'Please specify the title or the topic of the dataset of interest.'
+            elif state['intent'] == 'EXPLAINPOST' and len(state['entities']) == 0:
+                return 'Please specify the topic or the author of the post.'
             else:
                 gen_kwargs = {"length_penalty": 2.5, "num_beams": 8, "max_length": 120, "repetition_penalty": 2.5, "temperature": 2}
                 question = self.generators['amb']('question: ' + utterance + ' context: ' + consec_history, **gen_kwargs)[0]['generated_text']

VRE.py (449 changed lines)

@@ -1,111 +1,148 @@
 from datetime import datetime
 import pandas as pd
 import string
 import re
 import requests
 import os
 from io import BytesIO
 import PyPDF2
 from tqdm.auto import tqdm
 import numpy as np
 import math
 import faiss
+import pdfquery
 import urllib.request
 import time
 import threading
+import html2text


 class VRE:
     def __init__(self, name, token, retriever, directory='/app/'):
         self.name = name
         self.token = token
         self.catalogue_url = 'https://api.d4science.org/catalogue/items/'
+        self.socialnetwork_url = 'https://api.d4science.org/rest/2/posts/get-posts-vre/'
         self.headers = {"gcube-token": self.token, "Accept": "application/json"}
         self.lastupdatetime = datetime.strptime('2021-01-01T00:00:00.000000', '%Y-%m-%dT%H:%M:%S.%f').timestamp()
+        self.postlastupdate = datetime.strptime('2021-01-01T00:00:00.000000', '%Y-%m-%dT%H:%M:%S.%f').timestamp()
         self.retriever = retriever
         self.directory = directory
+        self.post_counter = 0
         self.paper_counter = 0
         self.dataset_counter = 0
         self.content_counter = 0
         self.db = {'paper_db': pd.read_json(self.directory + self.name + '_paper.json') if os.path.isfile(self.directory + self.name + '_paper.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']),
                    'dataset_db': pd.read_json(self.directory + self.name + '_dataset.json') if os.path.isfile(self.directory + self.name + '_dataset.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']),
-                   'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content'])}
+                   'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content']),
+                   'post_db': pd.read_json(self.directory + self.name + '_post.json') if os.path.isfile(self.directory + self.name + '_post.json') else pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])}
         self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else faiss.read_index(self.directory + 'janet_dataset_titles_index'),
                       'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else faiss.read_index(self.directory + 'janet_paper_titles_index'),
                       'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else faiss.read_index(self.directory + 'janet_dataset_desc_index'),
                       'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else faiss.read_index(self.directory + 'janet_paper_desc_index'),
-                      'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index(self.directory + 'janet_content_index')}
+                      'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index(self.directory + 'janet_content_index'),
+                      'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else faiss.read_index(self.directory + 'janet_post_index')}
         self.new_income = False

     def init(self):
         #first run
-        if not os.path.isfile(self.directory + self.name + '_dataset' + '.json') or not os.path.isfile(self.directory + self.name + '_paper' + '.json') or not os.path.isfile(self.directory + self.name + '_content' + '.json'):
+        if not os.path.isfile(self.directory + self.name + '_dataset' + '.json') or not os.path.isfile(self.directory + self.name + '_paper' + '.json') or not os.path.isfile(self.directory + self.name + '_content' + '.json') or not os.path.isfile(self.directory + self.name + '_post' + '.json'):
             self.get_content()
         if self.index['dataset_titles_index'] is None:
             self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
             self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')

         if self.index['dataset_desc_index'] is None:
             self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
             self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')

         if self.index['paper_titles_index'] is None:
             self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
             self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')

         if self.index['paper_desc_index'] is None:
             self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
             self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')

         if self.index['content_index'] is None:
             self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
             self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')

+        if self.index['post_index'] is None:
+            self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
+            self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')

     def index_periodic_update(self):
         if self.new_income:
             if len(self.db['content_db'])%100 != 0:
                 self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
                 self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
+            if len(self.db['post_db'])%100 != 0:
+                self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
+                self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
             if len(self.db['paper_db'])%100 != 0:
                 self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
                 self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
                 self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
                 self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
             if len(self.db['dataset_db'])%100 != 0:
                 self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
                 self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
                 self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
                 self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
             self.new_income = False

     def create_index(self, db_type, attribute, index_type, filename):
         filename = self.directory + filename
         to_index = self.db[db_type][attribute]
         for i, info in enumerate(to_index):
             if i == 0:
                 emb = self.retriever.encode([info])
                 sentence_embeddings = np.array(emb)
             else:
                 emb = self.retriever.encode([info])
                 sentence_embeddings = np.append(sentence_embeddings, emb, axis=0)

         # number of partitions of the coarse quantizer = number of posting lists
         # as a rule of thumb, 4*sqrt(N) < nlist < 16*sqrt(N), where N is the size of the database
         nlist = int(4 * math.sqrt(len(sentence_embeddings))) if int(4 * math.sqrt(len(sentence_embeddings))) < len(sentence_embeddings) else len(sentence_embeddings)-1
         code_size = 8  # = number of subquantizers = number of sub-vectors
         n_bits = 4 if len(sentence_embeddings) >= 2**4 else int(math.log2(len(sentence_embeddings)))  # n_bits of each code (8 -> 1 byte codes)
         d = sentence_embeddings.shape[1]
         coarse_quantizer = faiss.IndexFlatL2(d)  # will keep centroids of coarse quantizer (for inverted list)
         self.index[index_type] = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits)
         self.index[index_type].train(sentence_embeddings)  # train on a random subset to speed up k-means (NOTE: ensure they are randomly chosen!)
         faiss.write_index(self.index[index_type], filename)

     def populate_index(self, db_type, attribute, index_type, filename):
         filename = self.directory + filename
         to_index = self.db[db_type][attribute]
         for info in to_index:
             sentence_embedding = np.array(self.retriever.encode([info]))
             self.index[index_type].add(sentence_embedding)
         faiss.write_index(self.index[index_type], filename)

     def get_content(self):
+        h = html2text.HTML2Text()
+        h.ignore_links = True
+        #posts
+        posts = requests.get(self.socialnetwork_url, headers=self.headers)
+        posts = posts.json()['result']
+        post_df = pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])
+
+        for post in posts:
+            author = post['full_name'].lower()
+            content = h.handle(post['description']).replace('\n', ' ').lower()
+            date = post['time']
+            tags = []
+            for word in content.split():
+                if word[0] == '#':
+                    tags.append(word[1:])
+            if date > self.postlastupdate:
+                self.postlastupdate = date
+            self.post_counter += 1
+            post_df.loc[str(self.post_counter)] = [self.post_counter, author, content, date, tags]
+
+        #catalog
         response = requests.get(self.catalogue_url, headers=self.headers)
         items = response.json()
         items_data = []
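
create_index (shown above as unchanged context) sizes the IVF-PQ index from the corpus: roughly 4*sqrt(N) coarse cells, 8 sub-quantizers, and 4-8 bit codes. A worked, self-contained sketch with illustrative numbers:

    import math
    import numpy as np
    import faiss

    N, d = 5000, 384                                  # corpus size and embedding dim (illustrative)
    emb = np.random.random((N, d)).astype('float32')

    nlist = min(int(4 * math.sqrt(N)), N - 1)         # 282 posting lists for N=5000
    code_size = 8                                     # sub-vectors per embedding; d must be divisible by it
    n_bits = 4 if N >= 2**4 else int(math.log2(N))    # bits per PQ code
    quantizer = faiss.IndexFlatL2(d)                  # coarse quantizer holding the cell centroids
    index = faiss.IndexIVFPQ(quantizer, d, nlist, code_size, n_bits)
    index.train(emb)                                  # k-means over the embeddings
    index.add(emb)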
@@ -148,90 +185,234 @@ class VRE:

         self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True)
         self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True)
+        self.db['post_db'] = post_df.sort_values(by='time', ascending=True)
         self.db['content_db'] = content_df

         self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
         self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
         self.db['content_db'].to_json(self.directory + self.name + '_content.json')
+        self.db['post_db'].to_json(self.directory + self.name + '_post.json')

     # modify query
     def get_vre_update(self):
         print("Getting new items")
-        response = requests.get(self.catalogue_url, headers=self.headers)
-        items = response.json()
-        items_data = []
-        for item in items:
-            api_url = self.catalogue_url + item + '/'
-            response = requests.get(api_url, headers=self.headers)
-            if datetime.strptime(response.json()['metadata_created'],'%Y-%m-%dT%H:%M:%S.%f').timestamp() > self.lastupdatetime:
-                items_data.append(response.json())
-
-        keys = ['type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']
-
-        paper_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
-        dataset_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
-        content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
-
-        for item in items_data:
-            for el in item['extras']:
-                if el['key'] == 'system:type':
-                    rsrc = el['value']
-            resources = []
-            for resource in item['resources']:
-                resources.append(
-                    {'name': resource['name'].lower(), 'url': resource['url'], 'description': resource['description'].lower()})
-            tags = []
-            for tag in item['tags']:
-                tags.append(tag['name'].lower())
-            title = item['title'].lower()
-            author = item['author'].lower()
-            notes = item['notes'].lower()
-            date = datetime.strptime(item['metadata_created'], '%Y-%m-%dT%H:%M:%S.%f').timestamp()
-            if date > self.lastupdatetime:
-                self.lastupdatetime = date
-
-            if rsrc == 'Paper':
-                self.paper_counter += 1
-                paper_df.loc[str(self.paper_counter)] = [self.paper_counter, rsrc, resources, tags, title, author, notes, date]
-                content_df = self.get_pdf_content(item, content_df)
-                content_df = self.get_txt_content(item, content_df)
-            if rsrc == 'Dataset':
-                self.dataset_counter += 1
-                dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date]
-
-        self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)])
-        self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)])
-
-        self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
-        self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
-        self.db['content_db'] = pd.concat([self.db['content_db'], content_df])
-        self.db['content_db'].to_json(self.directory + self.name + '_content.json')
-        if not paper_df.empty or not dataset_df.empty or not content_df.empty:
-            self.new_income = True
+        h = html2text.HTML2Text()
+        h.ignore_links = True
+        #posts
+        posts = requests.get(self.socialnetwork_url, headers=self.headers)
+        posts = posts.json()['result']
+        new_posts = []
+        for post in posts:
+            if post['time'] > self.postlastupdate:
+                new_posts.append(post)
+
+        post_df = pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])
+
+        for post in new_posts:
+            author = post['full_name'].lower()
+            content = h.handle(post['description']).replace('\n', ' ').lower()
+            date = post['time']
+            tags = []
+            for word in content.split():
+                if word[0] == '#':
+                    tags.append(word[1:])
+            if date > self.postlastupdate:
+                self.postlastupdate = date
+            self.post_counter += 1
+            post_df.loc[str(self.post_counter)] = [self.post_counter, author, content, date, tags]
+
+        #catalog
+        response = requests.get(self.catalogue_url, headers=self.headers)
+        items = response.json()
+        items_data = []
+        for item in items:
+            api_url = self.catalogue_url + item + '/'
+            response = requests.get(api_url, headers=self.headers)
+            if datetime.strptime(response.json()['metadata_created'],'%Y-%m-%dT%H:%M:%S.%f').timestamp() > self.lastupdatetime:
+                items_data.append(response.json())
+
+        keys = ['type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']
+
+        paper_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
+        dataset_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
+        content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
+
+        for item in items_data:
+            for el in item['extras']:
+                if el['key'] == 'system:type':
+                    rsrc = el['value']
+            resources = []
+            for resource in item['resources']:
+                resources.append(
+                    {'name': resource['name'].lower(), 'url': resource['url'], 'description': resource['description'].lower()})
+            tags = []
+            for tag in item['tags']:
+                tags.append(tag['name'].lower())
+            title = item['title'].lower()
+            author = item['author'].lower()
+            notes = item['notes'].lower()
+            date = datetime.strptime(item['metadata_created'], '%Y-%m-%dT%H:%M:%S.%f').timestamp()
+            if date > self.lastupdatetime:
+                self.lastupdatetime = date
+
+            if rsrc == 'Paper':
+                self.paper_counter += 1
+                paper_df.loc[str(self.paper_counter)] = [self.paper_counter, rsrc, resources, tags, title, author, notes, date]
+                content_df = self.get_pdf_content(item, content_df)
+                content_df = self.get_txt_content(item, content_df)
+            if rsrc == 'Dataset':
+                self.dataset_counter += 1
+                dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date]
+
+        self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)])
+        self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)])
+
+        self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)])
+        self.db['post_db'].to_json(self.directory + self.name + '_post.json')
+
+        self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
+        self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
+        self.db['content_db'] = pd.concat([self.db['content_db'], content_df])
+        self.db['content_db'].to_json(self.directory + self.name + '_content.json')
+        if not paper_df.empty or not dataset_df.empty or not content_df.empty or not post_df.empty:
+            self.new_income = True

     def remove_useless_dots(self, line):
         modline = ''
         for i in range(0, len(line)):
             if line[i] != '.':
                 modline += line[i]
             if line[i] == '.':
                 if line[i-2] == ' ' or line[i-2] in string.punctuation:
                     continue
                 if line[i-1] == '.':
                     continue
                 if line[i-3] == ' ' and line[i-2] == 'a' and line[i-1] == 'l':
                     continue
                 modline += line[i]
         modline = re.sub(r'\.+', ".", modline)
         return modline

     def check_if_sentence(self, sentence):
         if (len(sentence.split())) > 9 or '.' in sentence:
             return True
         return False

     def get_abstract(self, text):
         abstract_start = 0
         abstract_end = len(text)
         for i in range(0, len(text)):
             if len(text[i].split()) > 0:
                 words = text[i].split()
                 if words[0].lower() == 'abstract':
                     abstract_start = i
                     for j in range(i+1, len(text)):
                         if len(text[j]) == 0 and j > i+5:
                             abstract_end = j
                             break
                     break
         return abstract_start, abstract_end

     def useful_index(self, text):
         start = 0
         end = len(text)
         for i in range(0, len(text)):
             if len(text[i].split()) > 0:
                 words = text[i].split()
                 if words[0].lower() in ['bibliography', 'references']:
                     if i < end:
                         end = i
                 if words[0].lower() in ['introduction', '1 introduction', '1. introduction', '1.introduction']:
                     start = i
                 if words[0].lower() in ['acknowledgement', 'acknowledgements']:
                     if i < end:
                         end = i
         return start, end

     def get_line_sentences(self, text, i):
         mytext = self.remove_useless_dots(text[i])
         if self.check_if_sentence(mytext):
             splits = mytext.split('.')
             for j in range(len(splits)):
                 if j+1 < len(splits):
                     splits[j] = splits[j]+'. '
                 if j == len(splits)-1:
                     splits[j] = splits[j].removesuffix('-')
             return splits, i+1
         else:
             return [], i+1

     def parts_to_sentences(self, parts):
         sentences = []
         sentence = ''
         for part in parts:
             sentence += part
             if '.' in sentence:
                 sentences.append(sentence)
                 sentence = ''
         return sentences

     def get_pdf_content(self, item, df):
         for rsrc in tqdm(item['resources']):
             response = requests.get(rsrc['url'])
             if 'application/pdf' in response.headers.get('content-type'):
-                my_raw_data = response.content
-                with BytesIO(my_raw_data) as data:
-                    read_pdf = PyPDF2.PdfReader(data)
-                    for page in tqdm(range(len(read_pdf.pages))):
-                        content = read_pdf.pages[page].extract_text()
-                        self.content_counter += 1
-                        df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, content]
+                urllib.request.urlretrieve(rsrc['url'], self.directory + "janet.pdf")
+                pdf = pdfquery.PDFQuery(self.directory + "janet.pdf")
+                pdf.load()
+                text = []
+
+                for i, el in enumerate(pdf.tree.getiterator()):
+                    if el.tag == 'LTTextLineHorizontal' or el.tag == 'LTTextBoxHorizontal':
+                        text.append(el.text)
+
+                paragraphs = []
+                parts = []
+                i, end = self.useful_index(text)
+                while i < end:
+                    sent, i = self.get_line_sentences(text, i)
+                    for part in sent:
+                        if part != '':
+                            x = part
+                            if len(part) > 1 and part[0] == ' ':
+                                x = part[1:]
+                            if len(part) > 2 and part[1] == ' ':
+                                x = part[2:]
+                            parts.append(x)
+                sentences = self.parts_to_sentences(parts)
+                for i in range(0, len(sentences)-4, 5):
+                    paragraph = sentences[i] + sentences[i+1] + sentences[i+2] + sentences[i+3] + sentences[i+4]
+                    paragraphs.append(paragraph)
+                for paragraph in tqdm(paragraphs):
+                    self.content_counter += 1
+                    df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, paragraph]
+
+                start, end = self.get_abstract(text)
+                abstract = ''
+                for i in range(start, end):
+                    abstract += text[i]
+                self.content_counter += 1
+                df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, abstract]
         return df

     def get_txt_content(self, item, df):
         for rsrc in tqdm(item['resources']):
             response = requests.get(rsrc['url'])
             if 'text/plain' in response.headers.get('content-type'):
                 content = response.text
-                self.content_counter += 1
-                df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, content]
+                content = self.remove_useless_dots(content)
+                sentences = content.split('.')
+                paragraphs = []
+                for i in range(0, len(sentences)-4, 5):
+                    paragraph = sentences[i] + '. ' + sentences[i+1] + '. ' + sentences[i+2] + '. ' + sentences[i+3] + '. ' + sentences[i+4] + '. '
+                    paragraphs.append(paragraph)
+                for paragraph in tqdm(paragraphs):
+                    self.content_counter += 1
+                    df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, paragraph]
         return df

     def get_db(self):
         return self.db

     def get_index(self):
         return self.index
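
Both extractors now split documents into five-sentence pseudo-paragraphs before they are embedded and indexed; any trailing remainder of fewer than five sentences is dropped, exactly as in the loops above. The grouping logic in isolation (illustrative helper, not in VRE.py):

    def to_paragraphs(sentences, size=5):
        # Mirrors range(0, len(sentences)-4, 5): full groups only, remainder dropped.
        return ['. '.join(sentences[i:i+size]) + '. '
                for i in range(0, len(sentences) - size + 1, size)]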

requirements.txt (2 changed lines)

@@ -7,6 +7,8 @@ nltk==3.7
 numpy==1.22.4
 pandas==1.3.5
 PyPDF2==3.0.1
+pdfquery
+html2text
 regex==2022.6.2
 requests==2.25.1
 scikit-learn==1.0.2