Fixed ID mapping in oai-pmh harvester
This commit is contained in:
parent
e186223556
commit
f45de081d1
|
@ -3,7 +3,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>org.gcube.data-publishing.gCat-Feeder</groupId>
|
<groupId>org.gcube.data-publishing.gCat-Feeder</groupId>
|
||||||
<artifactId>gCat-Feeder-Suite</artifactId>
|
<artifactId>gCat-Feeder-Suite</artifactId>
|
||||||
<version>${gFeedSuiteVersion}</version>
|
<version>${gFeedSuiteVersion}</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>oai-harvester</artifactId>
|
<artifactId>oai-harvester</artifactId>
|
||||||
<name>oai-harvester</name>
|
<name>oai-harvester</name>
|
||||||
|
|
|
@ -8,12 +8,9 @@ import javax.ws.rs.client.Client;
|
||||||
import javax.ws.rs.client.ClientBuilder;
|
import javax.ws.rs.client.ClientBuilder;
|
||||||
import javax.ws.rs.client.WebTarget;
|
import javax.ws.rs.client.WebTarget;
|
||||||
import javax.ws.rs.core.Response;
|
import javax.ws.rs.core.Response;
|
||||||
import javax.xml.bind.JAXB;
|
|
||||||
import javax.xml.bind.JAXBContext;
|
import javax.xml.bind.JAXBContext;
|
||||||
import javax.xml.bind.JAXBElement;
|
|
||||||
import javax.xml.bind.JAXBException;
|
import javax.xml.bind.JAXBException;
|
||||||
import javax.xml.bind.Unmarshaller;
|
import javax.xml.bind.Unmarshaller;
|
||||||
import javax.xml.transform.stream.StreamSource;
|
|
||||||
|
|
||||||
import org.gcube.data.publishing.gFeed.collectors.oai.model.CommunicationException;
|
import org.gcube.data.publishing.gFeed.collectors.oai.model.CommunicationException;
|
||||||
import org.gcube.data.publishing.gFeed.collectors.oai.model.DCRecordMetadata;
|
import org.gcube.data.publishing.gFeed.collectors.oai.model.DCRecordMetadata;
|
||||||
|
@ -27,6 +24,7 @@ import org.glassfish.jersey.client.ClientProperties;
|
||||||
|
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
|
@ -55,7 +53,11 @@ public class OAIClient {
|
||||||
@NonNull
|
@NonNull
|
||||||
private String baseUrl;
|
private String baseUrl;
|
||||||
|
|
||||||
|
@NonNull
|
||||||
|
@Setter
|
||||||
|
private Integer maxItems=-1;
|
||||||
|
|
||||||
|
|
||||||
Client client;
|
Client client;
|
||||||
|
|
||||||
private synchronized Client getWebClient() {
|
private synchronized Client getWebClient() {
|
||||||
|
@ -105,6 +107,16 @@ public class OAIClient {
|
||||||
resumptionToken=t.getId();
|
resumptionToken=t.getId();
|
||||||
}else isComplete=true; //no token = completion
|
}else isComplete=true; //no token = completion
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//Using limit
|
||||||
|
if(maxItems>0 && toReturn.size()>=maxItems) {
|
||||||
|
log.warn("MAX ITEMS LIMIT REACHED : "+toReturn.size()+" / "+maxItems);
|
||||||
|
isComplete=true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// }catch(CommunicationException e) {
|
// }catch(CommunicationException e) {
|
||||||
// log.warn("Received communication error "+e.getMessage());
|
// log.warn("Received communication error "+e.getMessage());
|
||||||
// log.debug("Current attempt number = "+currentAttempt," max attempt Number = "+MAX_ATTEMPTS+", attempts delay factor = ");
|
// log.debug("Current attempt number = "+currentAttempt," max attempt Number = "+MAX_ATTEMPTS+", attempts delay factor = ");
|
||||||
|
|
|
@ -64,7 +64,7 @@ public class GCatTransformer implements DataTransformer<GCatModel,OAIRecord>{
|
||||||
CkanItem item=new CkanItem();
|
CkanItem item=new CkanItem();
|
||||||
//escaping name chars
|
//escaping name chars
|
||||||
String toSetName=toTranslate.getHeader().getIdentifier();
|
String toSetName=toTranslate.getHeader().getIdentifier();
|
||||||
toSetName=toSetName.replaceAll("[^a-z0-9_\\\\-]", "_");
|
toSetName=toSetName.toLowerCase().replaceAll("[^a-z0-9_\\\\-]", "_");
|
||||||
item.setName(toSetName);
|
item.setName(toSetName);
|
||||||
OAIMetadata meta=toTranslate.getMetadata().getMetadata();
|
OAIMetadata meta=toTranslate.getMetadata().getMetadata();
|
||||||
if(meta instanceof DCRecordMetadata) {
|
if(meta instanceof DCRecordMetadata) {
|
||||||
|
|
|
@ -1,29 +1,146 @@
|
||||||
package org.gcube.application.gfeed.oai;
|
package org.gcube.application.gfeed.oai;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Map.Entry;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import javax.xml.bind.JAXBContext;
|
||||||
import javax.xml.bind.JAXBException;
|
import javax.xml.bind.JAXBException;
|
||||||
|
import javax.xml.bind.Marshaller;
|
||||||
|
import javax.xml.bind.Unmarshaller;
|
||||||
|
|
||||||
import org.gcube.data.publishing.gFeed.collectors.oai.OAIClient;
|
import org.gcube.data.publishing.gFeed.collectors.oai.OAIClient;
|
||||||
|
import org.gcube.data.publishing.gFeed.collectors.oai.model.DCRecordMetadata;
|
||||||
|
import org.gcube.data.publishing.gFeed.collectors.oai.model.MetadataHolder;
|
||||||
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIInteractionException;
|
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIInteractionException;
|
||||||
|
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIMetadata;
|
||||||
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIRecord;
|
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIRecord;
|
||||||
import org.gcube.data.publishing.gFeed.collectors.oai.model.ckan.GCatModel;
|
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAI_PMH;
|
||||||
import org.gcube.data.publishing.gFeed.collectors.oai.model.ckan.GCatTransformer;
|
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAI_PMH.ListRecords;
|
||||||
|
|
||||||
public class OAIClientTests {
|
public class OAIClientTests {
|
||||||
|
|
||||||
|
|
||||||
public static void main (String[] args) throws JAXBException, OAIInteractionException {
|
public static void main (String[] args) throws JAXBException, OAIInteractionException, IOException {
|
||||||
|
|
||||||
|
Collection <OAIRecord> records;
|
||||||
|
|
||||||
|
// FROM URL
|
||||||
String baseUrl="https://data.inrae.fr/oai";
|
String baseUrl="https://data.inrae.fr/oai";
|
||||||
OAIClient client=new OAIClient(baseUrl);
|
// String baseUrl="http://researchdata.cab.unipd.it/cgi/oai2";
|
||||||
Collection <OAIRecord> records=client.getAll(OAIClient.DC_METADATA_PREFIX);
|
Integer maxItems=-1;
|
||||||
|
records=fromURL(baseUrl,maxItems);
|
||||||
|
System.out.println("Serialized to : "+serialize(records));
|
||||||
|
|
||||||
|
|
||||||
|
//FROM PATH
|
||||||
|
// String path="/var/folders/21/65t9t22j3l3fk7s18m0t77rh0000gp/T/oai-records7470133448784207387.xml";
|
||||||
|
// records=read(path);
|
||||||
|
|
||||||
|
|
||||||
System.out.println("Records size = "+records.size());
|
System.out.println("Records size = "+records.size());
|
||||||
|
|
||||||
|
//Returned IDS check
|
||||||
|
System.out.println("Checking IDS ..");
|
||||||
|
HashSet<String> codes=new HashSet<String>();
|
||||||
|
for(OAIRecord r:records) {
|
||||||
|
String code=r.getHeader().getIdentifier();
|
||||||
|
if(codes.contains(code)) {
|
||||||
|
System.out.println("Found duplicate id : "+code);
|
||||||
|
}
|
||||||
|
codes.add(code);
|
||||||
|
}
|
||||||
|
|
||||||
GCatTransformer tr=new GCatTransformer();
|
System.out.println("IDS size is : "+codes.size());
|
||||||
tr.transform(records);
|
|
||||||
|
|
||||||
|
// Checking mapping
|
||||||
|
|
||||||
|
HashMap<String,Set<String>> collisions=new HashMap<>();
|
||||||
|
for(OAIRecord r:records) {
|
||||||
|
String toFix=r.getHeader().getIdentifier();
|
||||||
|
String fixed=toFix.toLowerCase().replaceAll("[^a-z0-9_\\\\-]", "_");
|
||||||
|
if(!collisions.containsKey(fixed))
|
||||||
|
collisions.put(fixed, new HashSet<String>());
|
||||||
|
collisions.get(fixed).add(toFix);
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println("Got "+collisions.size()+" unique ckan ids, listing collisions");
|
||||||
|
String maxCollided=null;
|
||||||
|
int maxCollisions=0;
|
||||||
|
|
||||||
|
for(Entry<String,Set<String>> entry:collisions.entrySet()) {
|
||||||
|
if(entry.getValue().size()>1)
|
||||||
|
System.out.println("Collision from "+entry.getKey()+" to "+entry.getValue());
|
||||||
|
if(entry.getValue().size()>maxCollisions) {
|
||||||
|
maxCollisions=entry.getValue().size();
|
||||||
|
maxCollided=entry.getKey();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println("Top hits from "+maxCollided+" to "+collisions.get(maxCollided));
|
||||||
|
|
||||||
|
//Use transofrmer
|
||||||
|
// GCatTransformer tr=new GCatTransformer();
|
||||||
|
// Set<GCatModel> translated=tr.transform(records);
|
||||||
|
// System.out.println("Trasnlated size = "+translated.size());
|
||||||
|
//
|
||||||
|
// HashSet<String> ids =new HashSet<String>();
|
||||||
|
// for(GCatModel model : translated)
|
||||||
|
// ids.add(model.getItem().getName());
|
||||||
|
//
|
||||||
|
// System.out.println("IDS size = "+ids.size());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static final Collection<OAIRecord> fromURL(String url,Integer maxItems) throws JAXBException, OAIInteractionException {
|
||||||
|
OAIClient client=new OAIClient(url);
|
||||||
|
client.setMaxItems(maxItems);
|
||||||
|
return client.getAll(OAIClient.DC_METADATA_PREFIX);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private static String serialize (Collection<OAIRecord> coll) throws JAXBException, IOException {
|
||||||
|
OAI_PMH toWrite=new OAI_PMH();
|
||||||
|
toWrite.setResponseRecords(new ListRecords(new ArrayList<OAIRecord>(coll),null));
|
||||||
|
|
||||||
|
|
||||||
|
JAXBContext jaxbContext = JAXBContext.newInstance(OAIRecord.class,
|
||||||
|
MetadataHolder.class,
|
||||||
|
OAIMetadata.class,
|
||||||
|
DCRecordMetadata.class,
|
||||||
|
OAI_PMH.class);
|
||||||
|
|
||||||
|
File outF=File.createTempFile("oai-records", ".xml");
|
||||||
|
PrintWriter out = new PrintWriter(outF);
|
||||||
|
|
||||||
|
Marshaller m=jaxbContext.createMarshaller();
|
||||||
|
// for(OAIRecord r:toWrite)
|
||||||
|
m.marshal(toWrite, out);
|
||||||
|
|
||||||
|
out.flush();
|
||||||
|
out.close();
|
||||||
|
return outF.getAbsolutePath();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Collection<OAIRecord> read(String file) throws JAXBException{
|
||||||
|
JAXBContext jaxbContext = JAXBContext.newInstance(OAIRecord.class,
|
||||||
|
MetadataHolder.class,
|
||||||
|
OAIMetadata.class,
|
||||||
|
DCRecordMetadata.class,
|
||||||
|
OAI_PMH.class);
|
||||||
|
|
||||||
|
Unmarshaller u=jaxbContext.createUnmarshaller();
|
||||||
|
OAI_PMH parsed=(OAI_PMH) u.unmarshal(new File(file));
|
||||||
|
return parsed.getResponseRecords().getRecords();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
<configuration debug="true">
|
||||||
|
|
||||||
|
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
|
||||||
|
<!-- encoders are assigned the type
|
||||||
|
ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
|
||||||
|
<encoder>
|
||||||
|
<pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
|
||||||
|
</encoder>
|
||||||
|
</appender>
|
||||||
|
|
||||||
|
<root level="DEBUG">
|
||||||
|
<appender-ref ref="STDOUT" />
|
||||||
|
</root>
|
||||||
|
</configuration>
|
2
pom.xml
2
pom.xml
|
@ -18,7 +18,7 @@
|
||||||
<properties>
|
<properties>
|
||||||
<distroDirectory>distro</distroDirectory>
|
<distroDirectory>distro</distroDirectory>
|
||||||
<gitBaseUrl>https://code-repo.d4science.org/gCubeSystem</gitBaseUrl>
|
<gitBaseUrl>https://code-repo.d4science.org/gCubeSystem</gitBaseUrl>
|
||||||
<gFeedSuiteVersion>1.0.2</gFeedSuiteVersion>
|
<gFeedSuiteVersion>1.0.3-SNAPSHOT</gFeedSuiteVersion>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<scm>
|
<scm>
|
||||||
|
|
Loading…
Reference in New Issue