The enrichment query set is now split on the workflow-node side, because of a timeout error on very large collections

This commit is contained in:
Enrico Ottonello 2021-01-07 23:40:47 +01:00
parent 4673c143ce
commit b8d0e3f741
1 changed file with 30 additions and 40 deletions

View File

@ -1,5 +1,6 @@
package eu.dnetlib.ariadneplus.workflows.nodes; package eu.dnetlib.ariadneplus.workflows.nodes;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.google.gson.Gson; import com.google.gson.Gson;
@ -12,6 +13,7 @@ import eu.dnetlib.msro.workflows.util.ResultsetProgressProvider;
import eu.dnetlib.msro.workflows.util.WorkflowsConstants; import eu.dnetlib.msro.workflows.util.WorkflowsConstants;
import eu.dnetlib.rmi.common.ResultSet; import eu.dnetlib.rmi.common.ResultSet;
import eu.dnetlib.rmi.manager.MSROException; import eu.dnetlib.rmi.manager.MSROException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.http.NameValuePair; import org.apache.http.NameValuePair;
@ -44,59 +46,47 @@ public class EnrichGraphDBContentJobNode extends AsyncJobNode {
private String datasource; private String datasource;
//for parallel requests to the publisher endpoint //for parallel requests to the publisher endpoint
private int nThreads = 5; private int nThreads = 1;
@Override @Override
protected String execute(final Env env) throws Exception { protected String execute(final Env env) throws Exception {
int statusCode = -1; int statusCode = -1;
String enrichResult = "noResult"; String enrichResult = "";
log.info("Publisher endpoint: " + getPublisherEndpoint());
log.info("Enrich Query Value: " + getSparqlUpdateQuery());
PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager(); PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
cm.setMaxTotal(nThreads); cm.setMaxTotal(nThreads);
CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build(); CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build();
log.info("Enrich endpoint: " + getEnrichEndpoint()); log.info("Enrich endpoint: " + getEnrichEndpoint());
CloseableHttpResponse responsePOST = null; CloseableHttpResponse responsePOST = null;
try { String queryValue = getSparqlUpdateQuery();
HttpPost post = new HttpPost(getEnrichEndpoint()); int countQueries = 0;
List<NameValuePair> params = Lists.newArrayList(); int countSuccess = 0;
String datasourceInterfaceValue = getDatasourceInterface(); String endpoint = getEnrichEndpoint();
StringEntity entity = new StringEntity(getSparqlUpdateQuery()); for(String query : Splitter.on(";").split(queryValue)){
post.setEntity(entity); if (StringUtils.isNoneBlank(query)) {
responsePOST = client.execute(post); countQueries++;
statusCode = responsePOST.getStatusLine().getStatusCode(); HttpPost post = new HttpPost(endpoint);
switch (statusCode) { StringEntity entity = new StringEntity(query);
case 200: post.setEntity(entity);
log.info("enrich graphDB content completed"); responsePOST = client.execute(post);
break; statusCode = responsePOST.getStatusLine().getStatusCode();
default: switch (statusCode) {
log.error("error enriching graphDB " + responsePOST.getStatusLine().getStatusCode() + ": " + responsePOST.getStatusLine().getReasonPhrase()); case 200:
break; log.info(String.format("Query %d executed: %s", countQueries, query));
break;
default:
log.error("error enriching graphDB " + responsePOST.getStatusLine().getStatusCode() + ": " + responsePOST.getStatusLine().getReasonPhrase());
throw new MSROException(String.format("Cannot execute sparql from %s", query));
}
countSuccess++;
} }
} catch (ConnectException ce) {
log.error(ce);
throw new MSROException("Unable to connect to Publisher endpoint" + getEnrichEndpoint());
} }
catch (IOException e) { enrichResult = String.format("Queries committed with success %d/%d", countSuccess, countQueries);
log.error(e);
throw new MSROException("IO Error" + getEnrichEndpoint());
}
finally{
if(responsePOST != null) responsePOST.close();
client.close();
cm.shutdown();
}
env.setAttribute(WorkflowsConstants.MAIN_LOG_PREFIX + "statusCode", Integer.toString(statusCode));
env.setAttribute(WorkflowsConstants.MAIN_LOG_PREFIX + "enrichResult", enrichResult);
log.info(enrichResult); log.info(enrichResult);
if (statusCode!=200) { if(responsePOST != null) responsePOST.close();
throw new MSROException("Error from Publisher endpoint [ status code: " + statusCode + " ]"); client.close();
} cm.shutdown();
env.setAttribute(WorkflowsConstants.MAIN_LOG_PREFIX + "enrichResult", enrichResult);
return Arc.DEFAULT_ARC; return Arc.DEFAULT_ARC;
} }