Documents older than a certain timestamp are now automatically removed after newer posts (or updates to them) have been indexed
git-svn-id: https://svn.d4science.research-infrastructures.eu/gcube/trunk/social-networking/social-data-indexer-se-plugin@124391 82a268e6-3cf1-43bd-a215-b396298e98cf
parent bc2bb8d0a9
commit 983ee1d6c4
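For context, the cleanup relies on a simple invariant: every document (re)indexed during a synchronization run is stamped with the run's start time init, so any document still carrying an older timestamp afterwards was not re-indexed, meaning it no longer exists in the Cassandra store and can safely be purged. A minimal sketch of the resulting flow, reusing the members that appear in the diff below (client, store, vreIds):

	long init = System.currentTimeMillis(); // start of this synchronization run

	for (String vreID : vreIds) {
		List<Feed> feeds = store.getAllFeedsByVRE(vreID);
		addEnhancedFeedsInBulk(feeds, init); // each indexed document gets timestamp == init
	}

	// anything still older than init was not touched above, hence stale
	deleteDocumentWithTimestampLowerThan(init);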
@@ -17,6 +17,7 @@ import org.elasticsearch.action.bulk.BulkRequest;
 import org.elasticsearch.action.bulk.BulkResponse;
 import org.elasticsearch.action.delete.DeleteResponse;
 import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.client.transport.NoNodeAvailableException;
 import org.elasticsearch.client.transport.TransportClient;
 import org.elasticsearch.common.settings.Settings;
@@ -24,13 +25,15 @@ import org.elasticsearch.common.transport.InetSocketTransportAddress;
 import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.index.query.BoolQueryBuilder;
+import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.search.SearchHit;
 import org.gcube.portal.databook.server.DBCassandraAstyanaxImpl;
 import org.gcube.portal.databook.server.DatabookStore;
 import org.gcube.portal.databook.shared.Attachment;
 import org.gcube.portal.databook.shared.Comment;
 import org.gcube.portal.databook.shared.EnhancedFeed;
 import org.gcube.portal.databook.shared.Feed;
 import org.gcube.portal.databook.shared.FeedType;
 import org.gcube.socialnetworking.social_data_indexing_common.ex.BulkInsertionFailedException;
 import org.gcube.socialnetworking.social_data_indexing_common.utils.ElasticSearchRunningCluster;
 import org.gcube.socialnetworking.social_data_indexing_common.utils.IndexFields;
@@ -139,7 +142,7 @@ public class SocialDataIndexerPlugin extends Plugin<SocialDataIndexerPluginDecla
 		for (String vreID : vreIds) {
 			try{
 				List<Feed> feeds = store.getAllFeedsByVRE(vreID);
-				addEnhancedFeedsInBulk(feeds);
+				addEnhancedFeedsInBulk(feeds, init);
 				logger.debug("Number of indexed feeds is " + feeds.size() + " for vre " + vreID);
 			}catch(Exception e){
 				logger.debug("Exception while saving feeds/comments into the index for vre " + vreID, e);
@@ -148,6 +151,9 @@ public class SocialDataIndexerPlugin extends Plugin<SocialDataIndexerPluginDecla
 		}
 
+		// delete documents with timestamp less than init
+		deleteDocumentWithTimestampLowerThan(init);
+
 		long end = System.currentTimeMillis();
 		logger.debug("Synchronization thread ends running. It took " + (end - init) + " milliseconds " +
 				" that is " + (double)(end - init)/(1000.0 * 60.0) + " minutes.");
 
@@ -172,9 +178,10 @@ public class SocialDataIndexerPlugin extends Plugin<SocialDataIndexerPluginDecla
 	/**
 	 * Add feeds into the elasticsearch index.
 	 * @param feeds
+	 * @param init is the timestamp that will be put in the document
 	 * @throws BulkInsertionFailedException
 	 */
-	private void addEnhancedFeedsInBulk(List<Feed> feeds) throws BulkInsertionFailedException {
+	private void addEnhancedFeedsInBulk(List<Feed> feeds, long init) throws BulkInsertionFailedException {
 		logger.debug("Starting bulk insert enhanced feeds operation");
 		BulkProcessor bulkProcessor = BulkProcessor.builder(
 				client,
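The hunk above cuts off right after BulkProcessor.builder(client, ...). For orientation, a typical ES 2.x builder chain is sketched below, using only classes the file already imports (BulkRequest, BulkResponse, ByteSizeValue, ByteSizeUnit, TimeValue); the listener bodies and the flush thresholds are illustrative assumptions, not the plugin's actual values:

	BulkProcessor bulkProcessor = BulkProcessor.builder(
			client,
			new BulkProcessor.Listener() {
				@Override
				public void beforeBulk(long executionId, BulkRequest request) {
					// invoked right before a bulk request is issued
				}
				@Override
				public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
					// invoked after a bulk request completes; response.hasFailures() flags partial errors
				}
				@Override
				public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
					// invoked when the whole bulk request failed
				}
			})
			.setBulkActions(1000)                               // flush every 1000 requests
			.setBulkSize(new ByteSizeValue(5, ByteSizeUnit.MB)) // ...or every 5 MB
			.setFlushInterval(TimeValue.timeValueSeconds(5))    // ...or every 5 seconds
			.setConcurrentRequests(1)                           // one bulk in flight while the next accumulates
			.build();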
@@ -223,19 +230,15 @@ public class SocialDataIndexerPlugin extends Plugin<SocialDataIndexerPluginDecla
 			// save feeds
 			for (Feed feed: feeds) {
 
 				// skip disabled feeds but delete from the index (they could be present)
 				if(feed.getType().equals(FeedType.DISABLED)){
 					deleteDocument(feed.getKey());
 					continue;
 				}
 
 				String enhFeedUUID = null;
 				try{
 					// enhance and convert
 					String json = enhanceAndConvertToJson(feed);
 					enhFeedUUID = feed.getKey();
 					IndexRequest ind = new IndexRequest(IndexFields.INDEX_NAME, IndexFields.EF_FEEDS_TABLE, enhFeedUUID)
-							.source(json);
+							.timestamp(Long.toString(init)) // set timestamp
+							.source(json); // add json object
 
 					bulkProcessor.add(ind);
 				}catch(Exception e){
 					logger.error("Skipping insert feed with id " + enhFeedUUID, e);
@@ -292,13 +295,48 @@ public class SocialDataIndexerPlugin extends Plugin<SocialDataIndexerPluginDecla
 	 * @param docID
 	 * @return
 	 */
 	private boolean deleteDocument(String docID) {
 		if(docID == null || docID.isEmpty())
 			return false;
 		logger.debug("Removing doc with id " + docID);
 		DeleteResponse response = client.prepareDelete(IndexFields.INDEX_NAME, IndexFields.EF_FEEDS_TABLE, docID).get();
 		logger.debug("doc found? " + response.isFound());
 		return response.isFound();
 	}
 
+	public void deleteDocumentWithTimestampLowerThan(long timestamp) {
+
+		logger.debug("Removing docs with timestamp lower than " + timestamp);
+
+		// query on timestamp field
+		BoolQueryBuilder filter = QueryBuilders.boolQuery();
+		filter.must(QueryBuilders.matchAllQuery());
+		filter.filter(QueryBuilders.rangeQuery("_timestamp").gte(0).lt(timestamp));
+
+		logger.debug(filter.toString());
+
+		SearchResponse scrollResp = client.prepareSearch(IndexFields.INDEX_NAME)
+				.setSize(100)
+				.setScroll(new TimeValue(60000))
+				.setQuery(filter)
+				.execute()
+				.actionGet();
+
+		int deletedDocs = 0;
+
+		// scroll until no hits are returned
+		while (true) {
+
+			for (SearchHit hit : scrollResp.getHits().getHits()) {
+
+				String docID = hit.getId();
+				DeleteResponse response = client.prepareDelete(IndexFields.INDEX_NAME, IndexFields.EF_FEEDS_TABLE, docID).get();
+				logger.debug("deleting doc with id = " + docID + "... found? " + response.isFound());
+				if(response.isFound())
+					deletedDocs++;
+			}
+
+			scrollResp = client.prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(60000)).execute().actionGet();
+
+			// break condition: no hits are returned
+			if (scrollResp.getHits().getHits().length == 0) {
+				logger.debug("No more hits to delete");
+				break;
+			}
+		}
+
+		logger.debug("Number of deleted documents is " + deletedDocs);
+	}
+
 	/**{@inheritDoc}*/
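One operational note on the range query over _timestamp used above: in Elasticsearch 2.x the _timestamp metadata field is disabled by default, so the type mapping must enable it for the timestamp set at index time to be stored and queryable. A hedged sketch of how that could be done with the same TransportClient; the mapping JSON and the assumption that the plugin does this at index setup are illustrative, not taken from this commit:

	// enable the legacy _timestamp metadata field on the feeds type (ES 2.x)
	client.admin().indices().preparePutMapping(IndexFields.INDEX_NAME)
			.setType(IndexFields.EF_FEEDS_TABLE)
			.setSource("{\"_timestamp\": {\"enabled\": true}}")
			.get();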