diff --git a/oai-harvester/pom.xml b/oai-harvester/pom.xml
index c2ac272..bf4428c 100644
--- a/oai-harvester/pom.xml
+++ b/oai-harvester/pom.xml
@@ -3,7 +3,7 @@
org.gcube.data-publishing.gCat-Feeder
gCat-Feeder-Suite
- ${gFeedSuiteVersion}
+ ${gFeedSuiteVersion}
oai-harvester
oai-harvester
diff --git a/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/OAIClient.java b/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/OAIClient.java
index 29193fd..f87c6a7 100644
--- a/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/OAIClient.java
+++ b/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/OAIClient.java
@@ -8,12 +8,9 @@ import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.client.WebTarget;
import javax.ws.rs.core.Response;
-import javax.xml.bind.JAXB;
import javax.xml.bind.JAXBContext;
-import javax.xml.bind.JAXBElement;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
-import javax.xml.transform.stream.StreamSource;
import org.gcube.data.publishing.gFeed.collectors.oai.model.CommunicationException;
import org.gcube.data.publishing.gFeed.collectors.oai.model.DCRecordMetadata;
@@ -27,6 +24,7 @@ import org.glassfish.jersey.client.ClientProperties;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
+import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@@ -55,7 +53,11 @@ public class OAIClient {
@NonNull
private String baseUrl;
-
+ @NonNull
+ @Setter
+ private Integer maxItems=-1;
+
+
Client client;
private synchronized Client getWebClient() {
@@ -105,6 +107,16 @@ public class OAIClient {
resumptionToken=t.getId();
}else isComplete=true; //no token = completion
+
+
+ //Using limit
+ if(maxItems>0 && toReturn.size()>=maxItems) {
+ log.warn("MAX ITEMS LIMIT REACHED : "+toReturn.size()+" / "+maxItems);
+ isComplete=true;
+ }
+
+
+
// }catch(CommunicationException e) {
// log.warn("Received communication error "+e.getMessage());
// log.debug("Current attempt number = "+currentAttempt," max attempt Number = "+MAX_ATTEMPTS+", attempts delay factor = ");
diff --git a/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/model/ckan/GCatTransformer.java b/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/model/ckan/GCatTransformer.java
index b8319c4..6b78300 100644
--- a/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/model/ckan/GCatTransformer.java
+++ b/oai-harvester/src/main/java/org/gcube/data/publishing/gFeed/collectors/oai/model/ckan/GCatTransformer.java
@@ -64,7 +64,7 @@ public class GCatTransformer implements DataTransformer{
CkanItem item=new CkanItem();
//escaping name chars
String toSetName=toTranslate.getHeader().getIdentifier();
- toSetName=toSetName.replaceAll("[^a-z0-9_\\\\-]", "_");
+ toSetName=toSetName.toLowerCase().replaceAll("[^a-z0-9_\\\\-]", "_");
item.setName(toSetName);
OAIMetadata meta=toTranslate.getMetadata().getMetadata();
if(meta instanceof DCRecordMetadata) {
diff --git a/oai-harvester/src/test/java/org/gcube/application/gfeed/oai/OAIClientTests.java b/oai-harvester/src/test/java/org/gcube/application/gfeed/oai/OAIClientTests.java
index 0a084df..7dc9e4a 100644
--- a/oai-harvester/src/test/java/org/gcube/application/gfeed/oai/OAIClientTests.java
+++ b/oai-harvester/src/test/java/org/gcube/application/gfeed/oai/OAIClientTests.java
@@ -1,29 +1,146 @@
package org.gcube.application.gfeed.oai;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map.Entry;
+import java.util.Set;
+import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
+import javax.xml.bind.Marshaller;
+import javax.xml.bind.Unmarshaller;
import org.gcube.data.publishing.gFeed.collectors.oai.OAIClient;
+import org.gcube.data.publishing.gFeed.collectors.oai.model.DCRecordMetadata;
+import org.gcube.data.publishing.gFeed.collectors.oai.model.MetadataHolder;
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIInteractionException;
+import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIMetadata;
import org.gcube.data.publishing.gFeed.collectors.oai.model.OAIRecord;
-import org.gcube.data.publishing.gFeed.collectors.oai.model.ckan.GCatModel;
-import org.gcube.data.publishing.gFeed.collectors.oai.model.ckan.GCatTransformer;
+import org.gcube.data.publishing.gFeed.collectors.oai.model.OAI_PMH;
+import org.gcube.data.publishing.gFeed.collectors.oai.model.OAI_PMH.ListRecords;
+/** Manual, console-driven checks for OAIClient: run main to load records and inspect identifier clashes. */
public class OAIClientTests {
- public static void main (String[] args) throws JAXBException, OAIInteractionException {
+	/** Loads OAI records, dumps them to a temp file, then prints duplicate identifiers and ckan-name collisions. */
+	public static void main (String[] args) throws JAXBException, OAIInteractionException, IOException {
+		Collection<OAIRecord> records;
+
+		// FROM URL
String baseUrl="https://data.inrae.fr/oai";
- OAIClient client=new OAIClient(baseUrl);
- Collection records=client.getAll(OAIClient.DC_METADATA_PREFIX);
+// String baseUrl="http://researchdata.cab.unipd.it/cgi/oai2";
+		// Non-positive limit: OAIClient only applies its cap when maxItems>0.
+		Integer maxItems=-1;
+		records=fromURL(baseUrl,maxItems);
+		System.out.println("Serialized to : "+serialize(records));
+
+		// FROM PATH: alternative to the harvest above, reloading a file previously written by serialize
+// String path="/var/folders/21/65t9t22j3l3fk7s18m0t77rh0000gp/T/oai-records7470133448784207387.xml";
+// records=read(path);
+
System.out.println("Records size = "+records.size());
+		// Returned IDS check
+		System.out.println("Checking IDS ..");
+		HashSet<String> codes=new HashSet<String>();
+		for(OAIRecord r:records) {
+			String code=r.getHeader().getIdentifier();
+			// add() returns false when the identifier is already in the set
+			if(!codes.add(code)) {
+				System.out.println("Found duplicate id : "+code);
+			}
+		}
- GCatTransformer tr=new GCatTransformer();
- tr.transform(records);
+		System.out.println("IDS size is : "+codes.size());
+
+		// Checking mapping: same lower-casing and replacement expression that GCatTransformer applies to the item name
+		// NOTE(review): as written, the character class also leaves a literal backslash untouched - confirm intended
+
+		HashMap<String,Set<String>> collisions=new HashMap<>();
+		for(OAIRecord r:records) {
+			String toFix=r.getHeader().getIdentifier();
+			String fixed=toFix.toLowerCase().replaceAll("[^a-z0-9_\\\\-]", "_");
+			if(!collisions.containsKey(fixed))
+				collisions.put(fixed, new HashSet<String>());
+			collisions.get(fixed).add(toFix);
+		}
+
+		System.out.println("Got "+collisions.size()+" unique ckan ids, listing collisions");
+		String maxCollided=null;
+		int maxCollisions=0;
+
+		for(Entry<String,Set<String>> entry:collisions.entrySet()) {
+			if(entry.getValue().size()>1)
+				System.out.println("Collision from "+entry.getKey()+" to "+entry.getValue());
+			if(entry.getValue().size()>maxCollisions) {
+				maxCollisions=entry.getValue().size();
+				maxCollided=entry.getKey();
+			}
+		}
+
+		System.out.println("Top hits from "+maxCollided+" to "+collisions.get(maxCollided));
+
+		// Use transformer (currently disabled)
+// GCatTransformer tr=new GCatTransformer();
+// Set translated=tr.transform(records);
+// System.out.println("Translated size = "+translated.size());
+//
+// HashSet ids =new HashSet();
+// for(GCatModel model : translated)
+// ids.add(model.getItem().getName());
+//
+// System.out.println("IDS size = "+ids.size());
}
+
+	/** Returns what OAIClient.getAll yields for the DC metadata prefix at {@code url}; {@code maxItems} is handed to the client via setMaxItems. */
+	private static final Collection<OAIRecord> fromURL(String url,Integer maxItems) throws JAXBException, OAIInteractionException {
+		OAIClient client=new OAIClient(url);
+		client.setMaxItems(maxItems);
+		return client.getAll(OAIClient.DC_METADATA_PREFIX);
+	}
+
+	/** Writes the records, wrapped in an OAI_PMH/ListRecords envelope, to a new temp XML file and returns its absolute path. */
+	private static String serialize (Collection<OAIRecord> coll) throws JAXBException, IOException {
+		OAI_PMH toWrite=new OAI_PMH();
+		toWrite.setResponseRecords(new ListRecords(new ArrayList<OAIRecord>(coll),null));
+
+		JAXBContext jaxbContext = JAXBContext.newInstance(OAIRecord.class,
+				MetadataHolder.class,
+				OAIMetadata.class,
+				DCRecordMetadata.class,
+				OAI_PMH.class);
+		Marshaller m=jaxbContext.createMarshaller();
+
+		File outF=File.createTempFile("oai-records", ".xml");
+		// try-with-resources flushes and closes the writer even when marshalling throws.
+		// The charset is pinned to UTF-8, the marshaller's default output encoding,
+		// so the file bytes agree with the XML declaration on every platform.
+		try (PrintWriter out = new PrintWriter(outF, "UTF-8")) {
+			m.marshal(toWrite, out);
+		}
+
+		return outF.getAbsolutePath();
+	}
+
+
+	/** Unmarshals the XML file at the given path as an OAI_PMH document and returns the records of its response. */
+	private static Collection<OAIRecord> read(String file) throws JAXBException{
+		JAXBContext jaxbContext = JAXBContext.newInstance(OAIRecord.class,
+				MetadataHolder.class,
+				OAIMetadata.class,
+				DCRecordMetadata.class,
+				OAI_PMH.class);
+		Unmarshaller u=jaxbContext.createUnmarshaller();
+		OAI_PMH parsed=(OAI_PMH) u.unmarshal(new File(file));
+		return parsed.getResponseRecords().getRecords();
+	}
}
diff --git a/oai-harvester/src/test/resources/logback.xml b/oai-harvester/src/test/resources/logback.xml
new file mode 100644
index 0000000..32d3ae2
--- /dev/null
+++ b/oai-harvester/src/test/resources/logback.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+ %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 59ed427..ad03e98 100644
--- a/pom.xml
+++ b/pom.xml
@@ -18,7 +18,7 @@
distro
https://code-repo.d4science.org/gCubeSystem
- 1.0.2
+ 1.0.3-SNAPSHOT