dnet-core/dnet-data-services/src/main/java/eu/dnetlib/data/collector/plugins/datasets/DatasetsIterator.java

275 lines
6.4 KiB
Java

package eu.dnetlib.data.collector.plugins.datasets;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
/**
 * Iterates over the dataset records returned by the search endpoint configured in
 * {@link #BASE_URL_TEMPLATE}, fetching them page by page ({@link #DEFAULT_SIZE} records per
 * request) and wrapping each one in a {@code datasetsRecord} XML envelope.
 *
 * <p>
 * The object is both the {@link Iterable} and the {@link Iterator}: {@link #iterator()} resets the
 * paging state, loads the first page and returns {@code this}. Until {@link #iterator()} has been
 * called, {@link #hasNext()} reports {@code false}.
 * </p>
 */
public class DatasetsIterator implements Iterable<String>, Iterator<String> {

    /** The logger. */
    private static final Log log = LogFactory.getLog(DatasetsIterator.class);

    /** URL template of the search endpoint; the placeholders are page size and start offset. */
    private static final String BASE_URL_TEMPLATE = "http://ws.pangaea.de/es/pangaea/panmd/_search?_source=xml&size=%d&from=%d";

    /** Number of records requested per page. */
    private static final int DEFAULT_SIZE = 10;

    /** Envelope of every emitted record: project id, journal name, ISSN, datasource id, metadata. */
    private static final String RECORD_TEMPLATE = "<datasetsRecord><oaf:projectid xmlns:oaf=\"http://namespace.openaire.eu/oaf\">%s</oaf:projectid>"
            + "<journal name='%s' issn='%s' datasourceid = '%s'/><metadata>%s</metadata></datasetsRecord>";

    /** Charset used for the JSON request body and for decoding the response body. */
    private static final String UTF_8 = "UTF-8";

    /** The journal id. */
    private String journalId = "";

    /** The journal name, stored XML-escaped when it comes from the constructor. */
    private String journalName = "";

    /** The journal issn. */
    private String journalISSN = "";

    /** The openaire datasource. */
    private String openaireDatasource = "";

    /** Total number of hits reported by the most recent response (0 when the request failed). */
    private long total;

    /** Offset of the first record of the current page. */
    private int from;

    /** Index, inside the current page, of the record returned by the next call to {@link #next()}. */
    private int currentIterator;

    /** The current page. */
    private ElasticSearchResponse currentResponse;

    /** The request serialized as JSON and posted to the endpoint. */
    private RequestField request;

    /** Project identifier written into every emitted record. */
    private String projectCordaId;

    /**
     * Instantiates a new datasets iterator.
     *
     * @param request
     *            the request posted (as JSON) for every page
     * @param projectCordaId
     *            the project identifier written into every emitted record
     * @param info
     *            the journal details written into every emitted record; when {@code null} the
     *            journal fields keep their empty-string defaults
     */
    public DatasetsIterator(final RequestField request, final String projectCordaId, final PangaeaJournalInfo info) {
        // Fields are assigned directly: calling the public (overridable) setters from a
        // constructor would run subclass code on a partially constructed object.
        this.request = request;
        this.projectCordaId = projectCordaId;
        if (info != null) {
            this.journalId = info.getJournalId();
            this.journalName = StringEscapeUtils.escapeXml(info.getJournalName());
            this.journalISSN = info.getJournalISSN();
            this.openaireDatasource = info.getDatasourceId();
        }
        log.debug("Start Iterator");
    }

    /**
     * Posts the request to the endpoint and returns the raw response body.
     *
     * @param from
     *            the offset of the first record to retrieve
     * @param size
     *            the number of records to retrieve
     * @return the response body, or {@code null} when the status code is not 200 or the call fails
     */
    private String executeQuery(final int from, final int size) {
        log.debug("executing query " + this.request.getQuery().getTerm());
        log.debug(String.format("from:%d size:%d", from, size));
        CloseableHttpResponse response = null;
        InputStream responseBody = null;
        final CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            final HttpPost post = new HttpPost(String.format(BASE_URL_TEMPLATE, size, from));
            final Gson gson = new GsonBuilder().disableHtmlEscaping().create();
            // Explicit charset: StringEntity(String) would encode the JSON as ISO-8859-1.
            post.setEntity(new StringEntity(gson.toJson(this.request), UTF_8));
            final long start = System.currentTimeMillis();
            response = httpclient.execute(post);
            final int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != 200) {
                log.error("Unexpected HTTP status " + statusCode + " on executing query :" + request.getQuery().getTerm());
                return null;
            }
            responseBody = response.getEntity().getContent();
            // Explicit charset: the single-argument overload uses the platform default.
            final String body = IOUtils.toString(responseBody, UTF_8);
            log.debug("Request done in " + (System.currentTimeMillis() - start) + " ms");
            return body;
        } catch (Exception e) {
            log.error("Error on executing query :" + request.getQuery().getTerm(), e);
            return null;
        } finally {
            // Each resource is closed independently and null-safely, so a failure before the
            // body was opened no longer raises a NullPointerException from this block.
            closeAndLog(responseBody);
            closeAndLog(response);
            closeAndLog(httpclient);
        }
    }

    /**
     * Closes the given resource if it is not {@code null}, logging (not propagating) any failure.
     *
     * @param resource
     *            the resource to close, may be {@code null}
     */
    private static void closeAndLog(final Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            log.error("Can't close connections gracefully", e);
        }
    }

    /**
     * Gets the journal id.
     *
     * @return the journalId
     */
    public String getJournalId() {
        return journalId;
    }

    /**
     * Sets the journal id.
     *
     * @param journalId
     *            the journalId to set
     */
    public void setJournalId(final String journalId) {
        this.journalId = journalId;
    }

    /**
     * Tells whether the absolute position of the next record is below the reported total.
     *
     * @return {@code true} when another record can be read with {@link #next()}
     * @see java.util.Iterator#hasNext()
     */
    @Override
    public boolean hasNext() {
        return (from + currentIterator) < total;
    }

    /**
     * Returns the next record wrapped in the {@code datasetsRecord} envelope and, once the current
     * page has been consumed, loads the following page if more records are expected.
     *
     * @return the XML of the next record
     * @throws NoSuchElementException
     *             when there is no further record
     * @see java.util.Iterator#next()
     */
    @Override
    public String next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        final String xml = String.format(RECORD_TEMPLATE, this.projectCordaId, this.journalName, this.journalISSN, this.openaireDatasource,
                currentResponse.getXmlRecords().get(currentIterator));
        currentIterator++;
        // Only ask for another page when the reported total says one exists; this avoids a
        // pointless request after the very last record.
        if (currentIterator == DEFAULT_SIZE && hasNext()) {
            getNextItem();
        }
        return xml;
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException
     *             always
     * @see java.util.Iterator#remove()
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Resets the paging state, loads the first page and returns this object as the iterator.
     *
     * @return this instance
     * @see java.lang.Iterable#iterator()
     */
    @Override
    public Iterator<String> iterator() {
        from = 0;
        total = 0;
        // Must be reset as well: getNextItem() adds it to 'from', so a stale value left by a
        // previous iteration would make the new one start at a non-zero offset.
        currentIterator = 0;
        getNextItem();
        return this;
    }

    /**
     * Advances the offset past the records consumed from the current page, fetches the page that
     * starts there and refreshes the total. A {@code null} response sets the total to 0, which
     * ends the iteration.
     */
    private void getNextItem() {
        from += currentIterator;
        currentResponse = ElasticSearchResponse.createNewResponse(executeQuery(from, DEFAULT_SIZE));
        total = currentResponse == null ? 0 : currentResponse.getTotal();
        log.debug("from : " + from + " total of the request is " + total);
        currentIterator = 0;
    }

    /**
     * @return the projectCordaId
     */
    public String getProjectCordaId() {
        return projectCordaId;
    }

    /**
     * @param projectCordaId
     *            the projectCordaId to set
     */
    public void setProjectCordaId(final String projectCordaId) {
        this.projectCordaId = projectCordaId;
    }

    /**
     * @return the journalName
     */
    public String getJournalName() {
        return journalName;
    }

    /**
     * @param journalName
     *            the journalName to set; it is written into the record as given (no escaping is
     *            applied by this setter)
     */
    public void setJournalName(final String journalName) {
        this.journalName = journalName;
    }

    /**
     * @return the journalISSN
     */
    public String getJournalISSN() {
        return journalISSN;
    }

    /**
     * @param journalISSN
     *            the journalISSN to set
     */
    public void setJournalISSN(final String journalISSN) {
        this.journalISSN = journalISSN;
    }

    /**
     * @return the openaireDatasource
     */
    public String getOpenaireDatasource() {
        return openaireDatasource;
    }

    /**
     * @param openaireDatasource
     *            the openaireDatasource to set
     */
    public void setOpenaireDatasource(final String openaireDatasource) {
        this.openaireDatasource = openaireDatasource;
    }
}