Documents older than a certain timestamp will now be automatically removed after newer posts (or updates to them) have been indexed

git-svn-id: https://svn.d4science.research-infrastructures.eu/gcube/trunk/social-networking/social-data-indexer-se-plugin@124391 82a268e6-3cf1-43bd-a215-b396298e98cf
Costantino Perciante 2016-02-19 17:42:45 +00:00
parent bc2bb8d0a9
commit 983ee1d6c4
1 changed file with 55 additions and 17 deletions


@@ -17,6 +17,7 @@ import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.delete.DeleteResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.transport.NoNodeAvailableException;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
@@ -24,13 +25,15 @@ import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.gcube.portal.databook.server.DBCassandraAstyanaxImpl;
import org.gcube.portal.databook.server.DatabookStore;
import org.gcube.portal.databook.shared.Attachment;
import org.gcube.portal.databook.shared.Comment;
import org.gcube.portal.databook.shared.EnhancedFeed;
import org.gcube.portal.databook.shared.Feed;
import org.gcube.portal.databook.shared.FeedType;
import org.gcube.socialnetworking.social_data_indexing_common.ex.BulkInsertionFailedException;
import org.gcube.socialnetworking.social_data_indexing_common.utils.ElasticSearchRunningCluster;
import org.gcube.socialnetworking.social_data_indexing_common.utils.IndexFields;
@@ -139,7 +142,7 @@ public class SocialDataIndexerPlugin extends Plugin<SocialDataIndexerPluginDecla
for (String vreID : vreIds) {
try{
List<Feed> feeds = store.getAllFeedsByVRE(vreID);
addEnhancedFeedsInBulk(feeds);
addEnhancedFeedsInBulk(feeds, init);
logger.debug("Number of indexed feeds is " + feeds.size() + " for vre " + vreID);
}catch(Exception e){
logger.debug("Exception while saving feeds/comments into the index for vre " + vreID, e);
@@ -148,6 +151,9 @@ public class SocialDataIndexerPlugin extends Plugin<SocialDataIndexerPluginDecla
}
// delete documents with timestamp less than init
deleteDocumentWithTimestampLowerThan(init);
long end = System.currentTimeMillis();
logger.debug("Synchronization thread ends running. It took " + (end - init) + " milliseconds " +
" that is " + (double)(end - init)/(1000.0 * 60.0) + " minutes.");
@@ -172,9 +178,10 @@ public class SocialDataIndexerPlugin extends Plugin<SocialDataIndexerPluginDecla
/**
* Add feeds into the elasticsearch index.
* @param feeds
* @param init is the timestamp that will be put in the document
* @throws BulkInsertionFailedException
*/
private void addEnhancedFeedsInBulk(List<Feed> feeds) throws BulkInsertionFailedException {
private void addEnhancedFeedsInBulk(List<Feed> feeds, long init) throws BulkInsertionFailedException {
logger.debug("Starting bulk insert enhanced feeds operation");
BulkProcessor bulkProcessor = BulkProcessor.builder(
client,
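The hunk cuts off right after the start of the BulkProcessor construction. For readers unfamiliar with the 2.x TransportClient API, this is roughly what a complete builder chain looks like; the listener bodies and the flush thresholds below are illustrative assumptions, not the plugin's actual configuration:

    // Illustrative BulkProcessor setup for the Elasticsearch 2.x client.
    BulkProcessor bulkProcessor = BulkProcessor.builder(
            client,
            new BulkProcessor.Listener() {
                @Override
                public void beforeBulk(long executionId, BulkRequest request) {
                    // e.g. log request.numberOfActions()
                }
                @Override
                public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
                    // e.g. check response.hasFailures()
                }
                @Override
                public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
                    // e.g. log the failure
                }
            })
            .setBulkActions(1000)                                // flush every 1000 requests...
            .setBulkSize(new ByteSizeValue(5, ByteSizeUnit.MB))  // ...or every 5 MB...
            .setFlushInterval(TimeValue.timeValueSeconds(30))    // ...or every 30 seconds
            .setConcurrentRequests(1)
            .build();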
@@ -223,19 +230,15 @@ public class SocialDataIndexerPlugin extends Plugin<SocialDataIndexerPluginDecla
// save feeds
for (Feed feed: feeds) {
// skip disabled feeds but delete from the index (they could be present)
if(feed.getType().equals(FeedType.DISABLED)){
deleteDocument(feed.getKey());
continue;
}
String enhFeedUUID = null;
try{
// enhance and convert
String json = enhanceAndConvertToJson(feed);
enhFeedUUID = feed.getKey();
IndexRequest ind = new IndexRequest(IndexFields.INDEX_NAME, IndexFields.EF_FEEDS_TABLE, enhFeedUUID)
.source(json);
.timestamp(Long.toString(init)) // set timestamp
.source(json); // add json object
bulkProcessor.add(ind);
}catch(Exception e){
logger.error("Skipping insert feed with id " + enhFeedUUID, e);
@@ -292,13 +295,48 @@ public class SocialDataIndexerPlugin extends Plugin<SocialDataIndexerPluginDecla
* @param docID
* @return
*/
private boolean deleteDocument(String docID) {
if(docID == null || docID.isEmpty())
return false;
logger.debug("Removing doc with id " + docID);
DeleteResponse response = client.prepareDelete(IndexFields.INDEX_NAME, IndexFields.EF_FEEDS_TABLE, docID).get();
logger.debug("doc found? " + response.isFound());
return response.isFound();
public void deleteDocumentWithTimestampLowerThan(long timestamp) {
logger.debug("Removing docs with timestamp lower than " + timestamp);
// query on timestamp field
BoolQueryBuilder filter = QueryBuilders.boolQuery();
filter.must(QueryBuilders.matchAllQuery());
filter.filter(QueryBuilders.rangeQuery("_timestamp").gte(0).lt(timestamp));
logger.debug(filter.toString());
SearchResponse scrollResp = client.prepareSearch(IndexFields.INDEX_NAME)
.setSize(100)
.setScroll(new TimeValue(60000))
.setQuery(filter)
.execute()
.actionGet();
int deleteDocs = 0;
//Scroll until no hits are returned
while (true) {
for (SearchHit hit : scrollResp.getHits().getHits()) {
String docID = hit.getId();
DeleteResponse response = client.prepareDelete(IndexFields.INDEX_NAME, IndexFields.EF_FEEDS_TABLE, docID).get();
logger.debug("deleting doc with id = " + docID + "...found? " + response.isFound()); // found of course..
if(response.isFound())
deleteDocs ++;
}
scrollResp = client.prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(60000)).execute().actionGet();
//Break condition: No hits are returned
if (scrollResp.getHits().getHits().length == 0) {
logger.debug("No more hits to delete");
break;
}
}
logger.debug("Number of delete documents is " + deleteDocs);
}
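The cleanup above deletes matching documents one request at a time while scrolling over the hits. A variant (purely illustrative, not what this commit does) could funnel the deletions through the same BulkProcessor to cut down on round trips, using org.elasticsearch.action.delete.DeleteRequest:

    // Illustrative alternative: batch the stale-document deletions through the bulk processor.
    while (true) {
        for (SearchHit hit : scrollResp.getHits().getHits()) {
            bulkProcessor.add(new DeleteRequest(IndexFields.INDEX_NAME, IndexFields.EF_FEEDS_TABLE, hit.getId()));
        }
        scrollResp = client.prepareSearchScroll(scrollResp.getScrollId())
                .setScroll(new TimeValue(60000)).execute().actionGet();
        if (scrollResp.getHits().getHits().length == 0) break;
    }
    bulkProcessor.flush();   // push any deletions still buffered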
/**{@inheritDoc}*/