gFeed/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/OAIClient.java

183 lines
5.3 KiB
Java
Raw Normal View History

2020-05-07 17:43:22 +02:00
package org.gcube.data.publishing.gFeed.collectors.oai;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
2020-12-16 11:29:12 +01:00
import java.util.List;
2020-05-07 17:43:22 +02:00
import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.client.WebTarget;
import javax.ws.rs.core.Response;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
2020-05-14 18:04:29 +02:00
import org.gcube.data.publishing.gFeed.collectors.oai.model.CommunicationException;
2020-05-07 17:43:22 +02:00
import org.gcube.data.publishing.gFeed.collectors.oai.model.DCRecordMetadata;
import org.gcube.data.publishing.gFeed.collectors.oai.model.MetadataHolder;
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIInteractionException;
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIMetadata;
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIRecord;
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAI_PMH;
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAI_PMH.Token;
import org.glassfish.jersey.client.ClientProperties;
2020-12-16 11:29:12 +01:00
import lombok.Getter;
2020-05-07 17:43:22 +02:00
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
2020-05-27 10:39:49 +02:00
import lombok.Setter;
2020-05-07 17:43:22 +02:00
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor
public class OAIClient {
private static JAXBContext jaxbContext=null;
2020-05-14 18:04:29 +02:00
private static final int MAX_ATTEMPTS=3;
private static final long DELAY_FACTOR=1000;
2020-12-16 11:29:12 +01:00
2020-05-07 17:43:22 +02:00
private static synchronized JAXBContext getContext() throws JAXBException {
if(jaxbContext==null)
2020-05-14 18:04:29 +02:00
jaxbContext = JAXBContext.newInstance(OAIRecord.class,
MetadataHolder.class,
OAIMetadata.class,
DCRecordMetadata.class,
OAI_PMH.class);
2020-05-07 17:43:22 +02:00
return jaxbContext;
}
2020-05-14 18:04:29 +02:00
2020-05-07 17:43:22 +02:00
public static final String DC_METADATA_PREFIX="oai_dc";
2020-05-14 18:04:29 +02:00
2020-05-07 17:43:22 +02:00
@NonNull
private String baseUrl;
2020-05-14 18:04:29 +02:00
2020-05-27 10:39:49 +02:00
@NonNull
@Setter
private Integer maxItems=-1;
2020-12-16 11:29:12 +01:00
2020-05-07 17:43:22 +02:00
Client client;
private synchronized Client getWebClient() {
if(client==null) {
client = ClientBuilder.newClient()
.property(ClientProperties.SUPPRESS_HTTP_COMPLIANCE_VALIDATION, true);
}
return client;
}
2020-12-16 11:29:12 +01:00
@Getter
private List<String> specifiedSets=new ArrayList<String>();
2020-05-07 17:43:22 +02:00
public Collection<OAIRecord> getAll(String metadataPrefix) throws JAXBException, OAIInteractionException{
ArrayList<OAIRecord> toReturn=new ArrayList<OAIRecord>();
2020-12-16 11:29:12 +01:00
WebTarget target=getWebClient().target(baseUrl).queryParam("verb","ListRecords");
if(!specifiedSets.isEmpty())
for(String set : specifiedSets) {
log.info("Loading "+metadataPrefix+" SET : "+set+" from "+baseUrl);
target.queryParam("set", set);
2022-11-04 13:54:37 +01:00
toReturn.addAll(call(target.queryParam("set", set),metadataPrefix));
2020-12-16 11:29:12 +01:00
}
else {
log.info("Loading "+metadataPrefix+" from "+baseUrl);
toReturn.addAll(call(target,metadataPrefix));
}
log.info("Obtained "+toReturn.size()+" from "+baseUrl);
return toReturn;
}
private List<OAIRecord> call(WebTarget target,String metadataPrefix){
ArrayList<OAIRecord> toReturn=new ArrayList<OAIRecord>();
2020-05-07 17:43:22 +02:00
2022-11-04 14:35:12 +01:00
log.info("Harvesting from resulting url {} ",target.getUri());
2020-05-07 17:43:22 +02:00
String resumptionToken=null;
2020-05-14 18:04:29 +02:00
2020-05-07 17:43:22 +02:00
// call & iterate
boolean isComplete=false;
2020-05-14 18:04:29 +02:00
int currentAttempt=1;
2020-05-07 17:43:22 +02:00
while(!isComplete) {
2020-05-14 18:04:29 +02:00
try {
2022-11-04 14:35:12 +01:00
if(resumptionToken==null) {
2020-05-14 18:04:29 +02:00
target=target.queryParam("metadataPrefix",metadataPrefix);
2022-11-04 14:35:12 +01:00
}
else {
2020-05-14 18:04:29 +02:00
target=target.queryParam("resumptionToken", resumptionToken);
2022-11-04 14:35:12 +01:00
}
2020-05-14 18:04:29 +02:00
2022-11-14 12:26:35 +01:00
log.trace("Calling {} ",target.getUri());
2022-11-04 14:35:12 +01:00
Response resp=target.request("application/xml").get();
2020-12-16 11:29:12 +01:00
2020-05-14 18:04:29 +02:00
2020-12-16 11:29:12 +01:00
OAI_PMH msg=check(resp);
2020-05-14 18:04:29 +02:00
2020-12-16 11:29:12 +01:00
if(msg.isError()) throw new OAIInteractionException(msg.getError().getCode()+ " : "+msg.getError().getMessage());
//No errors, thus reset attempt counter
currentAttempt=1;
2020-05-14 18:04:29 +02:00
2022-11-14 12:26:35 +01:00
if(msg.getResponseRecords().getRecords()!=null)
toReturn.addAll(msg.getResponseRecords().getRecords());
else log.info("NB {} didn't returned any record",msg.getRequest().getPath());
2020-12-16 11:29:12 +01:00
log.debug("Parsed "+toReturn.size()+" records so far.");
2022-11-14 12:26:35 +01:00
Token t=msg.getResponseRecords().getResumptionToken();
2020-05-14 18:04:29 +02:00
log.debug("Obtained token : "+t);
2020-12-16 11:29:12 +01:00
2020-05-14 18:04:29 +02:00
if(t!=null && t.getId()!=null && !t.getId().isEmpty()) {
resumptionToken=t.getId();
}else isComplete=true; //no token = completion
2020-05-27 10:39:49 +02:00
//Using limit
if(maxItems>0 && toReturn.size()>=maxItems) {
log.warn("MAX ITEMS LIMIT REACHED : "+toReturn.size()+" / "+maxItems);
isComplete=true;
}
2020-05-14 18:04:29 +02:00
}catch(Throwable t) {
2022-11-14 12:26:35 +01:00
log.warn("Unexpected ERROR ",t);
2020-05-14 18:04:29 +02:00
log.debug("Current attempt number = "+currentAttempt," max attempt Number = "+MAX_ATTEMPTS+", attempts delay factor = ");
isComplete=currentAttempt>MAX_ATTEMPTS;
try {
Thread.sleep(currentAttempt*DELAY_FACTOR);
} catch (InterruptedException e1) {}
currentAttempt++;
}
2020-05-07 17:43:22 +02:00
}
return toReturn;
}
2020-05-14 18:04:29 +02:00
private static OAI_PMH check(Response resp) throws JAXBException, CommunicationException {
2020-05-07 17:43:22 +02:00
if(resp.getStatus()<200||resp.getStatus()>=300) {
// exception
2020-05-14 18:04:29 +02:00
throw new CommunicationException("Received error message. STATUS "+resp.getStatus()+ ", message : "+resp.readEntity(String.class));
2020-05-07 17:43:22 +02:00
}else {
2020-05-14 18:04:29 +02:00
2020-05-07 17:43:22 +02:00
String respString=resp.readEntity(String.class);
Unmarshaller jaxbUnmarshaller = getContext().createUnmarshaller();
OAI_PMH obj=(OAI_PMH) jaxbUnmarshaller.unmarshal(new StringReader(respString));
2020-05-14 18:04:29 +02:00
2020-05-07 17:43:22 +02:00
return obj;
2020-05-14 18:04:29 +02:00
// OAI_PMH response = (OAI_PMH) jaxbUnmarshaller.unmarshal(
// new StreamSource(new StringReader(respString)));
2020-05-07 17:43:22 +02:00
}
}
}