enhance_janet

ahmed531998 2023-04-15 10:52:01 +02:00
parent ca125bb3cd
commit b006bc01ce
5 changed files with 374 additions and 148 deletions

DM.py (4 changed lines)

@@ -41,6 +41,10 @@ class DM:
else:
if self.curr_state['intent'] == 'QA':
return "RetGen"
if self.curr_state['intent'] == 'EXPLAINPOST':
return "findPost"
if self.curr_state['intent'] == 'HELP':
return "getHelp"
elif self.curr_state['intent'] == 'CHITCHAT':
return "ConvGen"
elif self.curr_state['intent'] == 'FINDPAPER':
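For reference, the branches above extend the DM's intent-to-action mapping with the new EXPLAINPOST and HELP intents. A minimal standalone sketch of that mapping (the table form, the pick_action helper and the Clarify fallback are illustrative assumptions, not the repository's API; the FINDPAPER action name is inferred from ResponseGenerator):

# Hypothetical sketch of the dispatch rule shown in the DM hunk above.
INTENT_ACTIONS = {
    'QA': 'RetGen',
    'EXPLAINPOST': 'findPost',   # new in this commit
    'HELP': 'getHelp',           # new in this commit
    'CHITCHAT': 'ConvGen',
    'FINDPAPER': 'findPaper',    # assumed; that branch is truncated in the hunk
}

def pick_action(state):
    # Unknown intents or unclear requests fall back to a clarification turn.
    if not state.get('is_clear', True):
        return 'Clarify'
    return INTENT_ACTIONS.get(state['intent'], 'Clarify')

print(pick_action({'intent': 'EXPLAINPOST', 'is_clear': True}))  # -> findPost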

NLU.py (6 changed lines)

@@ -87,7 +87,7 @@ class NLU:
if score > 0.5:
entities = self._entityextractor()
offense = self._offensepredictor()
if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER'] and len(entities) == 0:
if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER', 'EXPLAINPOST'] and len(entities) == 0:
return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense, "is_clear": False}
return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense, "is_clear": True}
else:
@@ -97,7 +97,7 @@ class NLU:
entities = self._entityextractor()
offense = self._offensepredictor()
if score > 0.5 or not self._ambigpredictor():
if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER'] and len(entities) == 0:
if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER', 'EXPLAINPOST'] and len(entities) == 0:
return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense, "is_clear": False}
return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense,
"is_clear": True}
@@ -107,6 +107,6 @@ class NLU:
else:
entities = self._entityextractor()
offense = self._offensepredictor()
if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER'] and len(entities) == 0:
if intent in ['FINDPAPER', 'FINDDATASET', 'SUMMARIZEPAPER', 'EXPLAINPOST'] and len(entities) == 0:
return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense, "is_clear": False}
return {"modified_query": self.to_process, "intent": intent, "entities": entities, "is_offensive": offense, "is_clear": True}

ResponseGenerator.py

@@ -7,7 +7,7 @@ import pandas as pd
from datetime import datetime
class ResponseGenerator:
def __init__(self, index, db,recommender,generators, retriever, num_retrieved=1):
def __init__(self, index, db,recommender,generators, retriever, num_retrieved=3):
self.generators = generators
self.retriever = retriever
self.recommender = recommender
@@ -16,6 +16,7 @@ class ResponseGenerator:
self.num_retrieved = num_retrieved
self.paper = {}
self.dataset = {}
self.post = {}
def update_index(self, index):
self.index = index
@@ -37,9 +38,18 @@ class ResponseGenerator:
else:
return {}
def _get_matching_authors(self, rsrc, author):
def _get_matching_authors(self, rsrc, author, recent=False):
cand = self.db[rsrc].loc[self.db[rsrc]['author'] == author.lower()].reset_index(drop=True)
if not cand.empty:
if recent:
index = 0
curr = 0
for i, row in cand.iterrows():
if row['time'] > curr:
index = i
curr = row['time']
return cand.loc[index]
else:
return cand.loc[0]
else:
return {}
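With recent=True, the loop above keeps the candidate row with the largest 'time' value, i.e. the author's newest post. Assuming 'time' holds numeric timestamps (as post_db stores them), the same selection can be written with pandas idxmax; a toy sketch:

import pandas as pd

posts = pd.DataFrame({
    'author':  ['jane doe', 'jane doe'],
    'content': ['older post', 'newest post'],
    'time':    [1650000000, 1680000000],   # illustrative numeric timestamps
})

cand = posts.loc[posts['author'] == 'jane doe'].reset_index(drop=True)
most_recent = cand.loc[cand['time'].idxmax()]   # same row the manual loop returns
print(most_recent['content'])                   # -> newest post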
@@ -61,15 +71,17 @@ class ResponseGenerator:
else:
return []
def _search_index(self, index_type, db_type, query):
def _search_index(self, index_type, db_type, query, multi=False):
xq = self.retriever.encode([query])
D, I = self.index[index_type].search(xq, self.num_retrieved)
if multi:
return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True)
return self.db[db_type].iloc[[I[0]][0]].reset_index(drop=True).loc[0]
def gen_response(self, action, utterance=None, username=None, state=None, consec_history=None):
if action == "Help":
return "Hey it's Janet! I am here to help you make use of the datasets and papers in the VRE. I can answer questions whose answers may be inside the papers. I can summarize papers for you. I can also chat with you. So, whichever it is, I am ready to chat!"
return "Hey it's Janet! I am here to help you make use of the datasets and papers in the catalogue. I can answer questions whose answers may be inside the papers. I can summarize papers for you. I can also chat with you. So, whichever it is, I am ready to chat!"
elif action == "Recommend":
prompt = self.recommender.make_recommendation(username)
if prompt != "":
@@ -79,10 +91,28 @@ class ResponseGenerator:
elif action == "OffenseReject":
return "I am sorry, I cannot answer to this kind of language"
elif action == "getHelp":
return "I can answer questions related to the papers in the VRE's catalog. I can also get you the posts, papers and datasets from the catalogue if you specify a topic or an author. I am also capable of small talk and summarizing papers to an extent. Just text me what you want and I will do it :)"
elif action == "findPost":
for entity in state['entities']:
if(entity['entity'] == 'TOPIC'):
self.post = self._get_matching_topics('post_db', entity['value'])
if len(self.post) > 0:
return str("This is a relevant post: " + self.post['content'] + ' by ' + self.post['author'])
if(entity['entity'] == 'AUTHOR'):
self.post = self._get_matching_authors('post_db', entity['value'], recent=True)
if len(self.post) > 0:
if len(self.post['tags']) > 0:
return str("Here is the most recent post by: " + self.post['author'] + ', which is about ' + ', '.join(self.post['tags']) + self.post['content'])
else:
return str("Here is the most recent post by: " + self.post['author'] + ', ' + self.post['content'])
return "I could not find the post you are looking for."
elif action == "ConvGen":
gen_kwargs = {"length_penalty": 2.5, "num_beams":2, "max_length": 30, "repetition_penalty": 2.5, "temperature": 2}
answer = self.generators['chat']('history: '+ consec_history + ' ' + utterance + ' persona: ' + 'I am Janet. My name is Janet. I am an AI developed by CNR to help VRE users.' , **gen_kwargs)[0]['generated_text']
#answer = self.generators['chat']('history: '+ consec_history + ' ' + utterance + ' persona: ' + 'I am Janet. My name is Janet. I am an AI developed by CNR to help VRE users.' , **gen_kwargs)[0]['generated_text']
answer = self.generators['chat']('question: ' + utterance + 'context: My name is Janet. I am an AI developed by CNR to help VRE users. ' + consec_history , **gen_kwargs)[0]['generated_text']
return answer
elif action == "findPaper":
@@ -142,14 +172,21 @@ class ResponseGenerator:
elif action == "RetGen":
#retrieve the most relevant paragraph
content = str(self._search_index('content_index', 'content_db', utterance)['content'])
content = self._search_index('content_index', 'content_db', utterance, multi=True)#['content']
evidence = ""
ev = ""
for i, row in content.iterrows():
evidence = evidence + str(i+1) + ") " + row['content'] + ' \n '
ev = ev + " " + row['content']
#generate the answer
gen_seq = 'question: '+utterance+" context: "+content
gen_seq = 'question: '+utterance+" context: "+ev
#handle return random 2 answers
gen_kwargs = {"length_penalty": 0.5, "num_beams":2, "max_length": 60, "repetition_penalty": 2.5, "temperature": 2}
answer = self.generators['qa'](gen_seq, **gen_kwargs)[0]['generated_text']
return str(answer)
return "According to the following evidence: " + evidence + " \n ........" + "The answer is: " + answer
elif action == "sumPaper":
if len(self.paper) == 0:
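In the RetGen branch above, the answer is now generated from the concatenation of the retrieved passages (ev) and returned together with a numbered evidence list (evidence). A compact sketch of that assembly, with a placeholder callable standing in for the self.generators['qa'] pipeline:

passages = ['passage one.', 'passage two.', 'passage three.']   # rows from _search_index(..., multi=True)

evidence, ev = '', ''
for i, content in enumerate(passages):
    evidence += str(i + 1) + ') ' + content + ' \n '   # user-facing, numbered evidence
    ev += ' ' + content                                # flat context for the generator

gen_seq = 'question: ' + 'what does the paper report?' + ' context: ' + ev

def qa_generator(seq, **gen_kwargs):        # placeholder, not the HuggingFace pipeline
    return [{'generated_text': 'a short generated answer'}]

answer = qa_generator(gen_seq, length_penalty=0.5, num_beams=2, max_length=60)[0]['generated_text']
print('According to the following evidence: ' + evidence + ' The answer is: ' + answer)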
@@ -165,8 +202,8 @@ class ResponseGenerator:
answer = ""
for i, row in df.iterrows():
gen_seq = 'summarize: '+row['content']
gen_kwargs = {"length_penalty": 1.5, "num_beams":6, "max_length": 120, "repetition_penalty": 2.5, "temperature": 2}
answer = self.generators['summ'](gen_seq, **gen_kwargs)[0]['generated_text'] + ' '
gen_kwargs = {"length_penalty": 1.5, "num_beams":6, "max_length": 30, "repetition_penalty": 2.5, "temperature": 2}
answer = answer + self.generators['summ'](gen_seq, **gen_kwargs)[0]['generated_text'] + ' '
return answer
elif action == "Clarify":
if state['intent'] in ['FINDPAPER', 'SUMMARIZEPAPER'] and len(state['entities']) == 0:
@@ -175,6 +212,8 @@ class ResponseGenerator:
elif state['intent'] == 'FINDDATASET' and len(state['entities']) == 0:
if len(self.dataset) == 0:
return 'Please specify the title, the topic of the dataset of interest.'
elif state['intent'] == 'EXPLAINPOST' and len(state['entities']) == 0:
return 'Please specify the topic or the author of the post.'
else:
gen_kwargs = {"length_penalty": 2.5, "num_beams":8, "max_length": 120, "repetition_penalty": 2.5, "temperature": 2}
question = self.generators['amb']('question: '+ utterance + ' context: ' + consec_history , **gen_kwargs)[0]['generated_text']

VRE.py (449 changed lines)

@@ -1,111 +1,148 @@
from datetime import datetime
import pandas as pd
import string
import re
import requests
import os
from io import BytesIO
import PyPDF2
from tqdm.auto import tqdm
import numpy as np
import math
import faiss
import pdfquery
import urllib.request
import time
import threading
import html2text
class VRE:
def __init__(self, name, token, retriever, directory='/app/'):
self.name = name
self.token = token
self.catalogue_url = 'https://api.d4science.org/catalogue/items/'
self.socialnetwork_url = 'https://api.d4science.org/rest/2/posts/get-posts-vre/'
self.headers = headers = {"gcube-token": self.token, "Accept": "application/json"}
self.lastupdatetime = datetime.strptime('2021-01-01T00:00:00.000000', '%Y-%m-%dT%H:%M:%S.%f').timestamp()
self.postlastupdate = datetime.strptime('2021-01-01T00:00:00.000000', '%Y-%m-%dT%H:%M:%S.%f').timestamp()
self.retriever = retriever
self.directory = directory
self.post_counter = 0
self.paper_counter = 0
self.dataset_counter = 0
self.content_counter = 0
self.db = {'paper_db': pd.read_json(self.directory + self.name + '_paper.json') if os.path.isfile(self.directory + self.name + '_paper.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']),
'dataset_db': pd.read_json(self.directory + self.name + '_dataset.json') if os.path.isfile(self.directory + self.name + '_dataset.json') else pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']),
'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content'])}
'content_db': pd.read_json(self.directory + self.name + '_content.json') if os.path.isfile(self.directory + self.name + '_content.json') else pd.DataFrame(columns=['id', 'paperid', 'content']),
'post_db': pd.read_json(self.directory + self.name + '_post.json') if os.path.isfile(self.directory + self.name + '_post.json') else pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])}
self.index = {'dataset_titles_index': None if not os.path.isfile(self.directory + 'janet_dataset_titles_index') else faiss.read_index(self.directory + 'janet_dataset_titles_index'),
'paper_titles_index': None if not os.path.isfile(self.directory + 'janet_paper_titles_index') else faiss.read_index(self.directory + 'janet_paper_titles_index'),
'dataset_desc_index': None if not os.path.isfile(self.directory + 'janet_dataset_desc_index') else faiss.read_index(self.directory + 'janet_dataset_desc_index'),
'paper_desc_index': None if not os.path.isfile(self.directory + 'janet_paper_desc_index') else faiss.read_index(self.directory + 'janet_paper_desc_index'),
'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index(self.directory + 'janet_content_index')}
'content_index': None if not os.path.isfile(self.directory + 'janet_content_index') else faiss.read_index(self.directory + 'janet_content_index'),
'post_index': None if not os.path.isfile(self.directory + 'janet_post_index') else faiss.read_index(self.directory + 'janet_post_index')}
self.new_income = False
def init(self):
#first run
if not os.path.isfile(self.directory + self.name + '_dataset' + '.json') or not os.path.isfile(self.directory + self.name + '_paper' + '.json') or not os.path.isfile(self.directory + self.name + '_content' + '.json'):
self.get_content()
if self.index['dataset_titles_index'] is None:
self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
if self.index['dataset_desc_index'] is None:
self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
if self.index['paper_titles_index'] is None:
self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
if self.index['paper_desc_index'] is None:
self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
if self.index['content_index'] is None:
self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
#first run
if not os.path.isfile(self.directory + self.name + '_dataset' + '.json') or not os.path.isfile(self.directory + self.name + '_paper' + '.json') or not os.path.isfile(self.directory + self.name + '_content' + '.json') or not os.path.isfile(self.directory + self.name + '_post' + '.json'):
self.get_content()
if self.index['dataset_titles_index'] is None:
self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
if self.index['dataset_desc_index'] is None:
self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
if self.index['paper_titles_index'] is None:
self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
if self.index['paper_desc_index'] is None:
self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
if self.index['content_index'] is None:
self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
if self.index['post_index'] is None:
self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
def index_periodic_update(self):
if self.new_income:
if len(self.db['content_db'])%100 != 0:
self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
if len(self.db['paper_db'])%100 != 0:
self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
if len(self.db['dataset_db'])%100 != 0:
self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
self.new_income = False
if self.new_income:
if len(self.db['content_db'])%100 != 0:
self.create_index('content_db', 'content', 'content_index', 'janet_content_index')
self.populate_index('content_db', 'content', 'content_index', 'janet_content_index')
if len(self.db['post_db'])%100 != 0:
self.create_index('post_db', 'content', 'post_index', 'janet_post_index')
self.populate_index('post_db', 'content', 'post_index', 'janet_post_index')
if len(self.db['paper_db'])%100 != 0:
self.create_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
self.populate_index('paper_db', 'title', 'paper_titles_index', 'janet_paper_titles_index')
self.create_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
self.populate_index('paper_db', 'notes', 'paper_desc_index', 'janet_paper_desc_index')
if len(self.db['dataset_db'])%100 != 0:
self.create_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
self.populate_index('dataset_db', 'title', 'dataset_titles_index', 'janet_dataset_titles_index')
self.create_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
self.populate_index('dataset_db', 'notes', 'dataset_desc_index', 'janet_dataset_desc_index')
self.new_income = False
def create_index(self, db_type, attribute, index_type, filename):
filename = self.directory + filename
to_index = self.db[db_type][attribute]
for i, info in enumerate(to_index):
if i == 0:
emb = self.retriever.encode([info])
sentence_embeddings = np.array(emb)
else:
emb = self.retriever.encode([info])
sentence_embeddings = np.append(sentence_embeddings, emb, axis=0)
filename = self.directory + filename
to_index = self.db[db_type][attribute]
for i, info in enumerate(to_index):
if i == 0:
emb = self.retriever.encode([info])
sentence_embeddings = np.array(emb)
else:
emb = self.retriever.encode([info])
sentence_embeddings = np.append(sentence_embeddings, emb, axis=0)
# number of partitions of the coarse quantizer = number of posting lists
# as rule of thumb, 4*sqrt(N) < nlist < 16*sqrt(N), where N is the size of the database
nlist = int(4 * math.sqrt(len(sentence_embeddings))) if int(4 * math.sqrt(len(sentence_embeddings))) < len(sentence_embeddings) else len(sentence_embeddings)-1
code_size = 8 # = number of subquantizers = number of sub-vectors
n_bits = 4 if len(sentence_embeddings) >= 2**4 else int(math.log2(len(sentence_embeddings))) # n_bits of each code (8 -> 1 byte codes)
d = sentence_embeddings.shape[1]
coarse_quantizer = faiss.IndexFlatL2(d) # will keep centroids of coarse quantizer (for inverted list)
self.index[index_type] = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits)
self.index[index_type].train(sentence_embeddings) # train on a random subset to speed up k-means (NOTE: ensure they are randomly chosen!)
faiss.write_index(self.index[index_type], filename)
# number of partitions of the coarse quantizer = number of posting lists
# as rule of thumb, 4*sqrt(N) < nlist < 16*sqrt(N), where N is the size of the database
nlist = int(4 * math.sqrt(len(sentence_embeddings))) if int(4 * math.sqrt(len(sentence_embeddings))) < len(sentence_embeddings) else len(sentence_embeddings)-1
code_size = 8 # = number of subquantizers = number of sub-vectors
n_bits = 4 if len(sentence_embeddings) >= 2**4 else int(math.log2(len(sentence_embeddings))) # n_bits of each code (8 -> 1 byte codes)
d = sentence_embeddings.shape[1]
coarse_quantizer = faiss.IndexFlatL2(d) # will keep centroids of coarse quantizer (for inverted list)
self.index[index_type] = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits)
self.index[index_type].train(sentence_embeddings) # train on a random subset to speed up k-means (NOTE: ensure they are randomly chosen!)
faiss.write_index(self.index[index_type], filename)
def populate_index(self, db_type, attribute, index_type, filename):
filename = self.directory + filename
to_index = self.db[db_type][attribute]
for info in to_index:
sentence_embedding = np.array(self.retriever.encode([info]))
self.index[index_type].add(sentence_embedding)
faiss.write_index(self.index[index_type], filename)
filename = self.directory + filename
to_index = self.db[db_type][attribute]
for info in to_index:
sentence_embedding = np.array(self.retriever.encode([info]))
self.index[index_type].add(sentence_embedding)
faiss.write_index(self.index[index_type], filename)
def get_content(self):
h = html2text.HTML2Text()
h.ignore_links = True
#posts
posts = requests.get(self.socialnetwork_url, headers=self.headers)
posts = posts.json()['result']
post_df = pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])
for post in posts:
author = post['full_name'].lower()
content = h.handle(post['description']).replace('\n', ' ').lower()
date = post['time']
tags = []
for word in content.split():
if word[0] == '#':
tags.append(word[1:])
if date > self.postlastupdate:
self.postlastupdate = date
self.post_counter += 1
post_df.loc[str(self.post_counter)] = [self.post_counter, author, content, date, tags]
#catalog
response = requests.get(self.catalogue_url, headers=self.headers)
items = response.json()
items_data = []
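The post-ingestion block above converts each post's HTML description to plain text with html2text, lowercases it, treats every '#word' token as a tag, and appends one row per post to post_df while tracking the newest timestamp. A small self-contained sketch of the conversion and tag extraction (the sample HTML is made up):

import html2text

h = html2text.HTML2Text()
h.ignore_links = True

description = '<p>New <b>catch statistics</b> released! #fisheries #data</p>'   # made-up post body
content = h.handle(description).replace('\n', ' ').lower()

tags = [word[1:] for word in content.split() if word[0] == '#']
print(tags)   # -> ['fisheries', 'data']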
@@ -148,90 +185,234 @@ class VRE:
self.db['paper_db'] = paper_df.sort_values(by='metadata_created', ascending=True)
self.db['dataset_db'] = dataset_df.sort_values(by='metadata_created', ascending=True)
self.db['post_db'] = post_df.sort_values(by='time', ascending=True)
self.db['content_db'] = content_df
self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
self.db['content_db'].to_json(self.directory + self.name + '_content.json')
self.db['post_db'].to_json(self.directory+self.name+'_post.json')
# modify query
def get_vre_update(self):
print("Getting new items")
response = requests.get(self.catalogue_url, headers=self.headers)
items = response.json()
items_data = []
for item in items:
api_url = self.catalogue_url + item + '/'
response = requests.get(api_url, headers=self.headers)
if datetime.strptime(response.json()['metadata_created'],'%Y-%m-%dT%H:%M:%S.%f').timestamp() > self.lastupdatetime:
items_data.append(response.json())
keys = ['type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']
paper_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
dataset_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
for item in items_data:
for el in item['extras']:
if el['key'] == 'system:type':
rsrc = el['value']
resources = []
for resource in item['resources']:
resources.append(
{'name': resource['name'].lower(), 'url': resource['url'], 'description': resource['description'].lower()})
print("Getting new items")
h = html2text.HTML2Text()
h.ignore_links = True
#posts
posts = requests.get(self.socialnetwork_url, headers=self.headers)
posts = posts.json()['result']
new_posts = []
for post in posts:
if post['time'] > self.postlastupdate:
new_posts.append(post)
post_df = pd.DataFrame(columns=['id', 'author', 'content', 'time', 'tags'])
for post in new_posts:
author = post['full_name'].lower()
content = h.handle(post['description']).replace('\n', ' ').lower()
date = post['time']
tags = []
for tag in item['tags']:
tags.append(tag['name'].lower())
title = item['title'].lower()
author = item['author'].lower()
notes = item['notes'].lower()
date = datetime.strptime(item['metadata_created'], '%Y-%m-%dT%H:%M:%S.%f').timestamp()
if date > self.lastupdatetime:
self.lastupdatetime = date
for word in content.split():
if word[0] == '#':
tags.append(word[1:])
if date > self.postlastupdate:
self.postlastupdate = date
self.post_counter += 1
post_df.loc[str(self.post_counter)] = [self.post_counter, author, content, date, tags]
if rsrc == 'Paper':
self.paper_counter += 1
paper_df.loc[str(self.paper_counter)] = [self.paper_counter, rsrc, resources, tags, title, author, notes, date]
content_df = self.get_pdf_content(item, content_df)
content_df = self.get_txt_content(item, content_df)
if rsrc == 'Dataset':
self.dataset_counter += 1
dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date]
#catalog
response = requests.get(self.catalogue_url, headers=self.headers)
items = response.json()
items_data = []
for item in items:
api_url = self.catalogue_url + item + '/'
response = requests.get(api_url, headers=self.headers)
if datetime.strptime(response.json()['metadata_created'],'%Y-%m-%dT%H:%M:%S.%f').timestamp() > self.lastupdatetime:
items_data.append(response.json())
self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)])
self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)])
keys = ['type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created']
paper_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
dataset_df = pd.DataFrame(columns=['id', 'type', 'resources', 'tags', 'title', 'author', 'notes', 'metadata_created'])
content_df = pd.DataFrame(columns=['id', 'paperid', 'content'])
for item in items_data:
for el in item['extras']:
if el['key'] == 'system:type':
rsrc = el['value']
resources = []
for resource in item['resources']:
resources.append(
{'name': resource['name'].lower(), 'url': resource['url'], 'description': resource['description'].lower()})
tags = []
for tag in item['tags']:
tags.append(tag['name'].lower())
title = item['title'].lower()
author = item['author'].lower()
notes = item['notes'].lower()
date = datetime.strptime(item['metadata_created'], '%Y-%m-%dT%H:%M:%S.%f').timestamp()
if date > self.lastupdatetime:
self.lastupdatetime = date
self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
self.db['content_db'] = pd.concat([self.db['content_db'], content_df])
self.db['content_db'].to_json(self.directory + self.name + '_content.json')
if not paper_df.empty or not dataset_df.empty or not content_df.empty:
self.new_income = True
if rsrc == 'Paper':
self.paper_counter += 1
paper_df.loc[str(self.paper_counter)] = [self.paper_counter, rsrc, resources, tags, title, author, notes, date]
content_df = self.get_pdf_content(item, content_df)
content_df = self.get_txt_content(item, content_df)
if rsrc == 'Dataset':
self.dataset_counter += 1
dataset_df.loc[str(self.dataset_counter)] = [self.dataset_counter, rsrc, resources, tags, title, author, notes, date]
self.db['paper_db'] = pd.concat([self.db['paper_db'], paper_df.sort_values(by='metadata_created', ascending=True)])
self.db['dataset_db'] = pd.concat([self.db['dataset_db'], dataset_df.sort_values(by='metadata_created', ascending=True)])
self.db['post_db'] = pd.concat([self.db['post_db'], post_df.sort_values(by='time', ascending=True)])
self.db['post_db'].to_json(self.directory+self.name+'_post.json')
self.db['paper_db'].to_json(self.directory + self.name + '_paper.json')
self.db['dataset_db'].to_json(self.directory + self.name + '_dataset.json')
self.db['content_db'] = pd.concat([self.db['content_db'], content_df])
self.db['content_db'].to_json(self.directory + self.name + '_content.json')
if not paper_df.empty or not dataset_df.empty or not content_df.empty or not post_df.empty:
self.new_income = True
def remove_useless_dots(self, line):
modline = ''
for i in range(0, len(line)):
if line[i] != '.':
modline+=line[i]
if line[i] == '.':
if line[i-2] == ' ' or line[i-2] in string.punctuation:
continue
if line[i-1] == '.':
continue
if line[i-3] == ' ' and line[i-2] == 'a' and line[i-1] == 'l':
continue
modline+=line[i]
modline = re.sub(r'\.+', ".", modline)
return modline
def check_if_sentence(self, sentence):
if (len(sentence.split())) > 9 or '.' in sentence:
return True
return False
def get_abstract(self, text):
abstract_start = 0
abstract_end = len(text)
for i in range(0, len(text)):
if len(text[i].split()) > 0:
words = text[i].split()
if words[0].lower() == 'abstract':
abstract_start = i
for j in range(i+1, len(text)):
if len(text[j]) == 0 and j > i+5:
abstract_end = j
break
break
return abstract_start, abstract_end
def useful_index(self, text):
start = 0
end = len(text)
for i in range(0, len(text)):
if len(text[i].split()) > 0:
words = text[i].split()
if words[0].lower() in ['bibliography','references']:
if i < end:
end = i
if words[0].lower() in ['introduction', '1 introduction', '1. introduction', '1.introduction']:
start = i
if words[0].lower() in ['acknowledgement', 'acknowledgements']:
if i < end:
end = i
return start, end
def get_line_sentences(self, text, i):
mytext = self.remove_useless_dots(text[i])
if self.check_if_sentence(mytext):
splits = mytext.split('.')
for j in range(len(splits)):
if j+1 < len(splits):
splits[j] = splits[j]+'. '
if j == len(splits)-1:
splits[j] = splits[j].removesuffix('-')
return splits, i+1
else:
return [], i+1
def parts_to_sentences(self, parts):
sentences = []
sentence = ''
for part in parts:
sentence += part
if '.' in sentence:
sentences.append(sentence)
sentence = ''
return sentences
def get_pdf_content(self, item, df):
for rsrc in tqdm(item['resources']):
response = requests.get(rsrc['url'])
if 'application/pdf' in response.headers.get('content-type'):
my_raw_data = response.content
with BytesIO(my_raw_data) as data:
read_pdf = PyPDF2.PdfReader(data)
for page in tqdm(range(len(read_pdf.pages))):
content = read_pdf.pages[page].extract_text()
self.content_counter += 1
df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, content]
return df
for rsrc in tqdm(item['resources']):
response = requests.get(rsrc['url'])
if 'application/pdf' in response.headers.get('content-type'):
urllib.request.urlretrieve(rsrc['url'], self.directory + "janet.pdf")
pdf = pdfquery.PDFQuery(self.directory + "janet.pdf")
pdf.load()
#pages = pdf.pq('LTPage')
text = []
for i, el in enumerate(pdf.tree.getiterator()):
if el.tag == 'LTTextLineHorizontal' or el.tag == 'LTTextBoxHorizontal':
text.append(el.text)
paragraphs = []
parts = []
i, end = self.useful_index(text)
while i < end:
sent, i = self.get_line_sentences(text, i)
for part in sent:
if part!='':
x = part
if len(part) > 1 and part[0] == ' ':
x = part[1:]
if len(part) > 2 and part[1] == ' ':
x = part[2:]
parts.append(x)
sentences = self.parts_to_sentences(parts)
for i in range(0, len(sentences)-4, 5):
paragraph = sentences[i] + sentences[i+1] + sentences[i+2] + sentences[i+3] + sentences[i+4]
paragraphs.append(paragraph)
for paragraph in tqdm(paragraphs):
self.content_counter += 1
df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, paragraph]
start, end = self.get_abstract(text)
abstract = ''
for i in range(start, end):
abstract += text[i]
self.content_counter += 1
df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, abstract]
return df
def get_txt_content(self, item, df):
for rsrc in tqdm(item['resources']):
response = requests.get(rsrc['url'])
if 'text/plain' in response.headers.get('content-type'):
content = response.text
self.content_counter += 1
df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, content]
return df
for rsrc in tqdm(item['resources']):
response = requests.get(rsrc['url'])
if 'text/plain' in response.headers.get('content-type'):
content = response.text
content = self.remove_useless_dots(content)
sentences = content.split('.')
paragraphs = []
for i in range(0, len(sentences)-4, 5):
paragraph = sentences[i] + '. ' + sentences[i+1]+ '. ' + sentences[i+2]+ '. ' + sentences[i+3] + '. ' + sentences[i+4]+ '. '
paragraphs.append(paragraph)
for paragraph in tqdm(paragraphs):
self.content_counter += 1
df.loc[str(self.content_counter)] = [self.content_counter, self.paper_counter, paragraph]
return df
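Both extractors above split the cleaned text into sentences and then pack them into fixed, non-overlapping chunks of five sentences, which become the retrievable units stored in content_db. A minimal sketch of that chunking step (the sentence list is illustrative):

sentences = ['sentence %d. ' % i for i in range(12)]   # stand-in for the split text

paragraphs = []
for i in range(0, len(sentences) - 4, 5):
    # Non-overlapping windows of five sentences; a trailing remainder of
    # fewer than five sentences is dropped, as in the loops above.
    paragraphs.append(''.join(sentences[i:i + 5]))

print(len(paragraphs))   # 2 chunks from 12 sentences; the last 2 sentences are dropped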
def get_db(self):
return self.db
return self.db
def get_index(self):
return self.index
return self.index
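create_index above trains a FAISS IVFPQ index: the number of coarse partitions follows the 4*sqrt(N) rule of thumb (capped below the collection size), vectors are split into 8 sub-quantizers, and 4-bit codes are used once there are at least 16 vectors. A standalone sketch of the same construction on toy embeddings so the parameter interplay is visible (dimensions and collection size are arbitrary; the embedding dimension must stay divisible by the number of sub-quantizers):

import math
import faiss
import numpy as np

emb = np.random.rand(500, 384).astype('float32')   # toy sentence embeddings
N, d = emb.shape

nlist = int(4 * math.sqrt(N)) if int(4 * math.sqrt(N)) < N else N - 1   # coarse partitions
code_size = 8                                                           # sub-quantizers (must divide d)
n_bits = 4 if N >= 2 ** 4 else int(math.log2(N))                        # bits per sub-code

coarse_quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, code_size, n_bits)
index.train(emb)                 # k-means for the coarse quantizer + PQ codebooks
index.add(emb)                   # the populate_index step
D, I = index.search(emb[:1], 3)
print(I[0])                      # ids of the 3 nearest stored vectors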

requirements.txt (2 changed lines)

@@ -7,6 +7,8 @@ nltk==3.7
numpy==1.22.4
pandas==1.3.5
PyPDF2==3.0.1
pdfquery
html2text
regex==2022.6.2
requests==2.25.1
scikit-learn==1.0.2