The enrichment query set is now split on the workflow node side, because of a timeout error on very large collections
This commit is contained in:
parent
4673c143ce
commit
b8d0e3f741
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.ariadneplus.workflows.nodes;
|
package eu.dnetlib.ariadneplus.workflows.nodes;
|
||||||
|
|
||||||
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
@ -12,6 +13,7 @@ import eu.dnetlib.msro.workflows.util.ResultsetProgressProvider;
|
||||||
import eu.dnetlib.msro.workflows.util.WorkflowsConstants;
|
import eu.dnetlib.msro.workflows.util.WorkflowsConstants;
|
||||||
import eu.dnetlib.rmi.common.ResultSet;
|
import eu.dnetlib.rmi.common.ResultSet;
|
||||||
import eu.dnetlib.rmi.manager.MSROException;
|
import eu.dnetlib.rmi.manager.MSROException;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.http.NameValuePair;
|
import org.apache.http.NameValuePair;
|
||||||
|
@ -44,59 +46,47 @@ public class EnrichGraphDBContentJobNode extends AsyncJobNode {
|
||||||
private String datasource;
|
private String datasource;
|
||||||
|
|
||||||
//for parallel requests to the publisher endpoint
|
//for parallel requests to the publisher endpoint
|
||||||
private int nThreads = 5;
|
private int nThreads = 1;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String execute(final Env env) throws Exception {
|
protected String execute(final Env env) throws Exception {
|
||||||
|
|
||||||
int statusCode = -1;
|
int statusCode = -1;
|
||||||
String enrichResult = "noResult";
|
String enrichResult = "";
|
||||||
log.info("Publisher endpoint: " + getPublisherEndpoint());
|
|
||||||
log.info("Enrich Query Value: " + getSparqlUpdateQuery());
|
|
||||||
PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
|
PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
|
||||||
cm.setMaxTotal(nThreads);
|
cm.setMaxTotal(nThreads);
|
||||||
CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build();
|
CloseableHttpClient client = HttpClients.custom().setConnectionManager(cm).build();
|
||||||
|
|
||||||
log.info("Enrich endpoint: " + getEnrichEndpoint());
|
log.info("Enrich endpoint: " + getEnrichEndpoint());
|
||||||
CloseableHttpResponse responsePOST = null;
|
CloseableHttpResponse responsePOST = null;
|
||||||
try {
|
String queryValue = getSparqlUpdateQuery();
|
||||||
HttpPost post = new HttpPost(getEnrichEndpoint());
|
int countQueries = 0;
|
||||||
List<NameValuePair> params = Lists.newArrayList();
|
int countSuccess = 0;
|
||||||
String datasourceInterfaceValue = getDatasourceInterface();
|
String endpoint = getEnrichEndpoint();
|
||||||
StringEntity entity = new StringEntity(getSparqlUpdateQuery());
|
for(String query : Splitter.on(";").split(queryValue)){
|
||||||
post.setEntity(entity);
|
if (StringUtils.isNoneBlank(query)) {
|
||||||
responsePOST = client.execute(post);
|
countQueries++;
|
||||||
statusCode = responsePOST.getStatusLine().getStatusCode();
|
HttpPost post = new HttpPost(endpoint);
|
||||||
switch (statusCode) {
|
StringEntity entity = new StringEntity(query);
|
||||||
case 200:
|
post.setEntity(entity);
|
||||||
log.info("enrich graphDB content completed");
|
responsePOST = client.execute(post);
|
||||||
break;
|
statusCode = responsePOST.getStatusLine().getStatusCode();
|
||||||
default:
|
switch (statusCode) {
|
||||||
log.error("error enriching graphDB " + responsePOST.getStatusLine().getStatusCode() + ": " + responsePOST.getStatusLine().getReasonPhrase());
|
case 200:
|
||||||
break;
|
log.info(String.format("Query %d executed: %s", countQueries, query));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
log.error("error enriching graphDB " + responsePOST.getStatusLine().getStatusCode() + ": " + responsePOST.getStatusLine().getReasonPhrase());
|
||||||
|
throw new MSROException(String.format("Cannot execute sparql from %s", query));
|
||||||
|
}
|
||||||
|
countSuccess++;
|
||||||
}
|
}
|
||||||
} catch (ConnectException ce) {
|
|
||||||
log.error(ce);
|
|
||||||
throw new MSROException("Unable to connect to Publisher endpoint" + getEnrichEndpoint());
|
|
||||||
}
|
}
|
||||||
catch (IOException e) {
|
enrichResult = String.format("Queries committed with success %d/%d", countSuccess, countQueries);
|
||||||
log.error(e);
|
|
||||||
throw new MSROException("IO Error" + getEnrichEndpoint());
|
|
||||||
}
|
|
||||||
finally{
|
|
||||||
if(responsePOST != null) responsePOST.close();
|
|
||||||
client.close();
|
|
||||||
cm.shutdown();
|
|
||||||
}
|
|
||||||
|
|
||||||
env.setAttribute(WorkflowsConstants.MAIN_LOG_PREFIX + "statusCode", Integer.toString(statusCode));
|
|
||||||
env.setAttribute(WorkflowsConstants.MAIN_LOG_PREFIX + "enrichResult", enrichResult);
|
|
||||||
|
|
||||||
log.info(enrichResult);
|
log.info(enrichResult);
|
||||||
if (statusCode!=200) {
|
if(responsePOST != null) responsePOST.close();
|
||||||
throw new MSROException("Error from Publisher endpoint [ status code: " + statusCode + " ]");
|
client.close();
|
||||||
}
|
cm.shutdown();
|
||||||
|
env.setAttribute(WorkflowsConstants.MAIN_LOG_PREFIX + "enrichResult", enrichResult);
|
||||||
return Arc.DEFAULT_ARC;
|
return Arc.DEFAULT_ARC;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue