From f45de081d15f82cfefd56c39dfbbc396597f98db Mon Sep 17 00:00:00 2001 From: FabioISTI Date: Wed, 27 May 2020 10:39:49 +0200 Subject: [PATCH] Fixed ID mapping in oai-pmh harvester --- oai-harvester/pom.xml | 2 +- .../gFeed/collectors/oai/OAIClient.java | 20 ++- .../oai/model/ckan/GCatTransformer.java | 2 +- .../application/gfeed/oai/OAIClientTests.java | 131 +++++++++++++++++- oai-harvester/src/test/resources/logback.xml | 14 ++ pom.xml | 2 +- 6 files changed, 157 insertions(+), 14 deletions(-) create mode 100644 oai-harvester/src/test/resources/logback.xml diff --git a/oai-harvester/pom.xml b/oai-harvester/pom.xml index c2ac272..bf4428c 100644 --- a/oai-harvester/pom.xml +++ b/oai-harvester/pom.xml @@ -3,7 +3,7 @@ org.gcube.data-publishing.gCat-Feeder gCat-Feeder-Suite - ${gFeedSuiteVersion} + ${gFeedSuiteVersion} oai-harvester oai-harvester diff --git a/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/OAIClient.java b/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/OAIClient.java index 29193fd..f87c6a7 100644 --- a/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/OAIClient.java +++ b/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/OAIClient.java @@ -8,12 +8,9 @@ import javax.ws.rs.client.Client; import javax.ws.rs.client.ClientBuilder; import javax.ws.rs.client.WebTarget; import javax.ws.rs.core.Response; -import javax.xml.bind.JAXB; import javax.xml.bind.JAXBContext; -import javax.xml.bind.JAXBElement; import javax.xml.bind.JAXBException; import javax.xml.bind.Unmarshaller; -import javax.xml.transform.stream.StreamSource; import org.gcube.data.publishing.gFeed.collectors.oai.model.CommunicationException; import org.gcube.data.publishing.gFeed.collectors.oai.model.DCRecordMetadata; @@ -27,6 +24,7 @@ import org.glassfish.jersey.client.ClientProperties; import lombok.NonNull; import lombok.RequiredArgsConstructor; +import lombok.Setter; import 
lombok.extern.slf4j.Slf4j; @Slf4j @@ -55,7 +53,11 @@ public class OAIClient { @NonNull private String baseUrl; - + @NonNull + @Setter + private Integer maxItems=-1; + + Client client; private synchronized Client getWebClient() { @@ -105,6 +107,16 @@ public class OAIClient { resumptionToken=t.getId(); }else isComplete=true; //no token = completion + + + //Using limit + if(maxItems>0 && toReturn.size()>=maxItems) { + log.warn("MAX ITEMS LIMIT REACHED : "+toReturn.size()+" / "+maxItems); + isComplete=true; + } + + + // }catch(CommunicationException e) { // log.warn("Received communication error "+e.getMessage()); // log.debug("Current attempt number = "+currentAttempt," max attempt Number = "+MAX_ATTEMPTS+", attempts delay factor = "); diff --git a/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/model/ckan/GCatTransformer.java b/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/model/ckan/GCatTransformer.java index b8319c4..6b78300 100644 --- a/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/model/ckan/GCatTransformer.java +++ b/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/model/ckan/GCatTransformer.java @@ -64,7 +64,7 @@ public class GCatTransformer implements DataTransformer{ CkanItem item=new CkanItem(); //escaping name chars String toSetName=toTranslate.getHeader().getIdentifier(); - toSetName=toSetName.replaceAll("[^a-z0-9_\\\\-]", "_"); + toSetName=toSetName.toLowerCase().replaceAll("[^a-z0-9_\\\\-]", "_"); item.setName(toSetName); OAIMetadata meta=toTranslate.getMetadata().getMetadata(); if(meta instanceof DCRecordMetadata) { diff --git a/oai-harvester/src/test/java/org/gcube/application/gfeed/oai/OAIClientTests.java b/oai-harvester/src/test/java/org/gcube/application/gfeed/oai/OAIClientTests.java index 0a084df..7dc9e4a 100644 --- a/oai-harvester/src/test/java/org/gcube/application/gfeed/oai/OAIClientTests.java +++ 
b/oai-harvester/src/test/java/org/gcube/application/gfeed/oai/OAIClientTests.java @@ -1,29 +1,146 @@ package org.gcube.application.gfeed.oai; +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map.Entry; +import java.util.Set; +import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; +import javax.xml.bind.Marshaller; +import javax.xml.bind.Unmarshaller; import org.gcube.data.publishing.gFeed.collectors.oai.OAIClient; +import org.gcube.data.publishing.gFeed.collectors.oai.model.DCRecordMetadata; +import org.gcube.data.publishing.gFeed.collectors.oai.model.MetadataHolder; import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIInteractionException; +import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIMetadata; import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIRecord; -import org.gcube.data.publishing.gFeed.collectors.oai.model.ckan.GCatModel; -import org.gcube.data.publishing.gFeed.collectors.oai.model.ckan.GCatTransformer; +import org.gcube.data.publishing.gFeed.collectors.oai.model.OAI_PMH; +import org.gcube.data.publishing.gFeed.collectors.oai.model.OAI_PMH.ListRecords; public class OAIClientTests { - public static void main (String[] args) throws JAXBException, OAIInteractionException { + public static void main (String[] args) throws JAXBException, OAIInteractionException, IOException { + + Collection records; + + // FROM URL String baseUrl="https://data.inrae.fr/oai"; - OAIClient client=new OAIClient(baseUrl); - Collection records=client.getAll(OAIClient.DC_METADATA_PREFIX); +// String baseUrl="http://researchdata.cab.unipd.it/cgi/oai2"; + Integer maxItems=-1; + records=fromURL(baseUrl,maxItems); + System.out.println("Serialized to : "+serialize(records)); + + + //FROM PATH +// String 
path="/var/folders/21/65t9t22j3l3fk7s18m0t77rh0000gp/T/oai-records7470133448784207387.xml"; +// records=read(path); + System.out.println("Records size = "+records.size()); + //Returned IDS check + System.out.println("Checking IDS .."); + HashSet codes=new HashSet(); + for(OAIRecord r:records) { + String code=r.getHeader().getIdentifier(); + if(codes.contains(code)) { + System.out.println("Found duplicate id : "+code); + } + codes.add(code); + } - GCatTransformer tr=new GCatTransformer(); - tr.transform(records); + System.out.println("IDS size is : "+codes.size()); + + + // Checking mapping + + HashMap> collisions=new HashMap<>(); + for(OAIRecord r:records) { + String toFix=r.getHeader().getIdentifier(); + String fixed=toFix.toLowerCase().replaceAll("[^a-z0-9_\\\\-]", "_"); + if(!collisions.containsKey(fixed)) + collisions.put(fixed, new HashSet()); + collisions.get(fixed).add(toFix); + } + + System.out.println("Got "+collisions.size()+" unique ckan ids, listing collisions"); + String maxCollided=null; + int maxCollisions=0; + + for(Entry> entry:collisions.entrySet()) { + if(entry.getValue().size()>1) + System.out.println("Collision from "+entry.getKey()+" to "+entry.getValue()); + if(entry.getValue().size()>maxCollisions) { + maxCollisions=entry.getValue().size(); + maxCollided=entry.getKey(); + } + } + + System.out.println("Top hits from "+maxCollided+" to "+collisions.get(maxCollided)); + + //Use transformer +// GCatTransformer tr=new GCatTransformer(); +// Set translated=tr.transform(records); +// System.out.println("Trasnlated size = "+translated.size()); +// +// HashSet ids =new HashSet(); +// for(GCatModel model : translated) +// ids.add(model.getItem().getName()); +// +// System.out.println("IDS size = "+ids.size()); } + + private static final Collection fromURL(String url,Integer maxItems) throws JAXBException, OAIInteractionException { + OAIClient client=new OAIClient(url); + client.setMaxItems(maxItems); + return
client.getAll(OAIClient.DC_METADATA_PREFIX); + } + + + + private static String serialize (Collection coll) throws JAXBException, IOException { + OAI_PMH toWrite=new OAI_PMH(); + toWrite.setResponseRecords(new ListRecords(new ArrayList(coll),null)); + + + JAXBContext jaxbContext = JAXBContext.newInstance(OAIRecord.class, + MetadataHolder.class, + OAIMetadata.class, + DCRecordMetadata.class, + OAI_PMH.class); + + File outF=File.createTempFile("oai-records", ".xml"); + PrintWriter out = new PrintWriter(outF); + + Marshaller m=jaxbContext.createMarshaller(); +// for(OAIRecord r:toWrite) + m.marshal(toWrite, out); + + out.flush(); + out.close(); + return outF.getAbsolutePath(); + } + + + private static Collection read(String file) throws JAXBException{ + JAXBContext jaxbContext = JAXBContext.newInstance(OAIRecord.class, + MetadataHolder.class, + OAIMetadata.class, + DCRecordMetadata.class, + OAI_PMH.class); + + Unmarshaller u=jaxbContext.createUnmarshaller(); + OAI_PMH parsed=(OAI_PMH) u.unmarshal(new File(file)); + return parsed.getResponseRecords().getRecords(); + } } diff --git a/oai-harvester/src/test/resources/logback.xml b/oai-harvester/src/test/resources/logback.xml new file mode 100644 index 0000000..32d3ae2 --- /dev/null +++ b/oai-harvester/src/test/resources/logback.xml @@ -0,0 +1,14 @@ + + + + + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n + + + + + + + \ No newline at end of file diff --git a/pom.xml b/pom.xml index 59ed427..ad03e98 100644 --- a/pom.xml +++ b/pom.xml @@ -18,7 +18,7 @@ distro https://code-repo.d4science.org/gCubeSystem - 1.0.2 + 1.0.3-SNAPSHOT