From d4799492a3f3b26501cbe4fc600cf6a05106871d Mon Sep 17 00:00:00 2001 From: Fabio Sinibaldi Date: Tue, 1 Mar 2022 17:39:43 +0100 Subject: [PATCH 1/8] Implements #22889 --- CHANGELOG.md | 5 ++ pom.xml | 9 +++- .../publishing/ckan2zenodo/commons/Net.java | 33 +++++++++++-- .../ckan2zenodo/model/DownloadedFile.java | 23 ++++++++- .../ckan2zenodo}/EnvironmentCheckTests.java | 4 +- .../publishing/ckan2zenodo}/GCatTests.java | 2 +- .../ckan2zenodo}/InspectCredentials.java | 2 +- .../publishing/ckan2zenodo}/OneHitTest.java | 4 +- .../publishing/ckan2zenodo}/ParsingTests.java | 9 +--- .../publishing/ckan2zenodo}/Previewer.java | 6 +-- .../publishing/ckan2zenodo}/TestCommons.java | 4 +- .../publishing/ckan2zenodo}/TokenSetter.java | 2 +- .../ckan2zenodo}/TransformationTests.java | 4 +- .../publishing/ckan2zenodo}/ZenodoTests.java | 3 +- .../ckan2zenodo/model/NetTests.java | 49 +++++++++++++++++++ 15 files changed, 124 insertions(+), 35 deletions(-) rename src/test/java/org/gcube/{tests => data/publishing/ckan2zenodo}/EnvironmentCheckTests.java (89%) rename src/test/java/org/gcube/{tests => data/publishing/ckan2zenodo}/GCatTests.java (96%) rename src/test/java/org/gcube/{tests => data/publishing/ckan2zenodo}/InspectCredentials.java (92%) rename src/test/java/org/gcube/{tests => data/publishing/ckan2zenodo}/OneHitTest.java (96%) rename src/test/java/org/gcube/{tests => data/publishing/ckan2zenodo}/ParsingTests.java (95%) rename src/test/java/org/gcube/{tests => data/publishing/ckan2zenodo}/Previewer.java (78%) rename src/test/java/org/gcube/{tests => data/publishing/ckan2zenodo}/TestCommons.java (95%) rename src/test/java/org/gcube/{tests => data/publishing/ckan2zenodo}/TokenSetter.java (94%) rename src/test/java/org/gcube/{tests => data/publishing/ckan2zenodo}/TransformationTests.java (96%) rename src/test/java/org/gcube/{tests => data/publishing/ckan2zenodo}/ZenodoTests.java (98%) create mode 100644 src/test/java/org/gcube/data/publishing/ckan2zenodo/model/NetTests.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 7239c0f..b6ab80d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). # Changelog for org.gcube.data.publishing.ckan2zenodo-library +## [v1.0.3-SNAPSHOT] 2022-03-01 +- Extensions from URL [#22889](https://support.d4science.org/issues/22889) + + + ## [v1.0.2] 2021-07-30 - Introduced environemnt check [#19990](https://support.d4science.org/issues/19990) diff --git a/pom.xml b/pom.xml index b775a62..85f7d0e 100644 --- a/pom.xml +++ b/pom.xml @@ -8,7 +8,7 @@ org.gcube.data.publishing ckan2zenodo-library - 1.0.2 + 1.0.3-SNAPSHOT CKAN 2 Zenodo Library Library to publish d4science CKAN items into Zenodo @@ -59,6 +59,13 @@ 1.14.8 + + + commons-io + commons-io + 1.4.0 + + org.glassfish.jersey.media jersey-media-json-jackson diff --git a/src/main/java/org/gcube/data/publishing/ckan2zenodo/commons/Net.java b/src/main/java/org/gcube/data/publishing/ckan2zenodo/commons/Net.java index 9b0bd15..1cf5754 100644 --- a/src/main/java/org/gcube/data/publishing/ckan2zenodo/commons/Net.java +++ b/src/main/java/org/gcube/data/publishing/ckan2zenodo/commons/Net.java @@ -1,12 +1,16 @@ package org.gcube.data.publishing.ckan2zenodo.commons; import java.io.File; +import java.io.IOException; import java.io.InputStream; +import java.net.HttpURLConnection; import java.net.URL; import java.nio.file.Files; import java.nio.file.StandardCopyOption; import java.security.DigestInputStream; import java.security.MessageDigest; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.gcube.data.publishing.ckan2zenodo.model.CkanResource; import org.gcube.data.publishing.ckan2zenodo.model.DownloadedFile; @@ -16,6 +20,9 @@ import lombok.extern.slf4j.Slf4j; @Slf4j public class Net { + private static final Pattern FILENAME_IN_DEPOSITION_REGEXP = Pattern.compile("(?<=filename\\=\\\").*(?=\\\")"); + + public static DownloadedFile download(CkanResource toDownload) throws Exception { String urlString=toDownload.getUrl(); log.debug("Downloading "+urlString); @@ -28,10 +35,17 @@ public class Net { InputStream is=null; int attempt=0; Exception lastException=null; + + String remoteFileName=null; + while(is==null&&attempt<5) { try { attempt++; is=url.openStream(); + + if(remoteFileName == null) + remoteFileName = getFilenameFromURL(url); + }catch(Exception e) { lastException=e; try{ @@ -40,17 +54,30 @@ public class Net { } } if(is==null) throw new Exception("Unable to download "+urlString,lastException); - + if(remoteFileName == null) + remoteFileName = ""; // Unable to evaluate from HEAD + + DigestInputStream dis = new DigestInputStream(is, md); // Download long size=Files.copy(is, temp.toPath(),StandardCopyOption.REPLACE_EXISTING); - return new DownloadedFile(toDownload,temp,dis.getMessageDigest().toString()); + return new DownloadedFile(toDownload,temp,dis.getMessageDigest().toString(),remoteFileName); } - + + private static final String getFilenameFromURL(URL url) throws IOException { + HttpURLConnection con = (HttpURLConnection) url.openConnection(); + con.setRequestMethod("GET"); + String contentDisp= con.getHeaderField("Content-Disposition"); + + Matcher m = FILENAME_IN_DEPOSITION_REGEXP.matcher(contentDisp); + m.find(); + return m.group(0); + } + } diff --git a/src/main/java/org/gcube/data/publishing/ckan2zenodo/model/DownloadedFile.java b/src/main/java/org/gcube/data/publishing/ckan2zenodo/model/DownloadedFile.java index 200e1a3..7c572c3 100644 --- a/src/main/java/org/gcube/data/publishing/ckan2zenodo/model/DownloadedFile.java +++ b/src/main/java/org/gcube/data/publishing/ckan2zenodo/model/DownloadedFile.java @@ -16,5 +16,26 @@ public class DownloadedFile { private CkanResource source; private File f; private String MD5; - + private String remoteFileName; + + + public String getToUseFileName(){ + if (getExtension(source.getName())!=null){ + // source contains extension + return source.getName(); + }else { + String evaluatedExtension=getExtension(remoteFileName); + if(evaluatedExtension!=null) + return source.getName()+evaluatedExtension; + else return source.getName(); // No extension + } + } + + static final String getExtension(String filename){ + int lastIndexOf = filename.lastIndexOf("."); + if (lastIndexOf == -1) { + return null; // no extension + } + return filename.substring(lastIndexOf); + } } diff --git a/src/test/java/org/gcube/tests/EnvironmentCheckTests.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/EnvironmentCheckTests.java similarity index 89% rename from src/test/java/org/gcube/tests/EnvironmentCheckTests.java rename to src/test/java/org/gcube/data/publishing/ckan2zenodo/EnvironmentCheckTests.java index 70f28db..ca59772 100644 --- a/src/test/java/org/gcube/tests/EnvironmentCheckTests.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/EnvironmentCheckTests.java @@ -1,7 +1,5 @@ -package org.gcube.tests; +package org.gcube.data.publishing.ckan2zenodo; -import org.gcube.data.publishing.ckan2zenodo.Ckan2Zenodo; -import org.gcube.data.publishing.ckan2zenodo.Ckan2ZenodoImpl; import org.gcube.data.publishing.ckan2zenodo.model.faults.*; import org.gcube.data.publishing.ckan2zenodo.model.report.EnvironmentReport; import org.junit.Assume; diff --git a/src/test/java/org/gcube/tests/GCatTests.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/GCatTests.java similarity index 96% rename from src/test/java/org/gcube/tests/GCatTests.java rename to src/test/java/org/gcube/data/publishing/ckan2zenodo/GCatTests.java index b76fe35..9bd6ea0 100644 --- a/src/test/java/org/gcube/tests/GCatTests.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/GCatTests.java @@ -1,4 +1,4 @@ -package org.gcube.tests; +package org.gcube.data.publishing.ckan2zenodo; import java.net.MalformedURLException; diff --git a/src/test/java/org/gcube/tests/InspectCredentials.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/InspectCredentials.java similarity index 92% rename from src/test/java/org/gcube/tests/InspectCredentials.java rename to src/test/java/org/gcube/data/publishing/ckan2zenodo/InspectCredentials.java index e721b2c..53580a3 100644 --- a/src/test/java/org/gcube/tests/InspectCredentials.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/InspectCredentials.java @@ -1,4 +1,4 @@ -package org.gcube.tests; +package org.gcube.data.publishing.ckan2zenodo; import org.gcube.data.publishing.ckan2zenodo.clients.Zenodo; import org.gcube.data.publishing.ckan2zenodo.model.ZenodoCredentials; diff --git a/src/test/java/org/gcube/tests/OneHitTest.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/OneHitTest.java similarity index 96% rename from src/test/java/org/gcube/tests/OneHitTest.java rename to src/test/java/org/gcube/data/publishing/ckan2zenodo/OneHitTest.java index c04e556..619567a 100644 --- a/src/test/java/org/gcube/tests/OneHitTest.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/OneHitTest.java @@ -1,4 +1,4 @@ -package org.gcube.tests; +package org.gcube.data.publishing.ckan2zenodo; import java.net.MalformedURLException; import java.util.ArrayList; @@ -7,8 +7,6 @@ import java.util.List; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; -import org.gcube.data.publishing.ckan2zenodo.Ckan2Zenodo; -import org.gcube.data.publishing.ckan2zenodo.Ckan2ZenodoImpl; import org.gcube.data.publishing.ckan2zenodo.clients.GCat; import org.gcube.data.publishing.ckan2zenodo.model.CkanItemDescriptor; import org.gcube.data.publishing.ckan2zenodo.model.CkanResource; diff --git a/src/test/java/org/gcube/tests/ParsingTests.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/ParsingTests.java similarity index 95% rename from src/test/java/org/gcube/tests/ParsingTests.java rename to src/test/java/org/gcube/data/publishing/ckan2zenodo/ParsingTests.java index 628dd6c..7ec096b 100644 --- a/src/test/java/org/gcube/tests/ParsingTests.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/ParsingTests.java @@ -1,26 +1,19 @@ -package org.gcube.tests; +package org.gcube.data.publishing.ckan2zenodo; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; import java.util.List; -import org.gcube.data.publishing.ckan2zenodo.Fixer; import org.gcube.data.publishing.ckan2zenodo.commons.Parsing; import org.gcube.data.publishing.ckan2zenodo.model.CkanItemDescriptor; import org.gcube.data.publishing.ckan2zenodo.model.CkanRelatedIdentifier; import org.gcube.data.publishing.ckan2zenodo.model.faults.ConfigurationException; import org.gcube.data.publishing.ckan2zenodo.model.faults.InvalidItemException; import org.gcube.data.publishing.ckan2zenodo.model.zenodo.DepositionMetadata; -import org.gcube.data.publishing.ckan2zenodo.model.zenodo.RelatedIdentifier; import org.gcube.data.publishing.ckan2zenodo.model.zenodo.ZenodoDeposition; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; -import com.fasterxml.jackson.annotation.JsonInclude.Include; import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/src/test/java/org/gcube/tests/Previewer.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/Previewer.java similarity index 78% rename from src/test/java/org/gcube/tests/Previewer.java rename to src/test/java/org/gcube/data/publishing/ckan2zenodo/Previewer.java index 87c2908..5267af6 100644 --- a/src/test/java/org/gcube/tests/Previewer.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/Previewer.java @@ -1,14 +1,10 @@ -package org.gcube.tests; +package org.gcube.data.publishing.ckan2zenodo; -import java.io.FileNotFoundException; import java.io.FileOutputStream; -import java.io.IOException; import org.gcube.common.resources.gcore.GenericResource; import org.gcube.common.resources.gcore.Resources; -import org.gcube.data.publishing.ckan2zenodo.Translator; import org.gcube.data.publishing.ckan2zenodo.commons.IS; -import org.gcube.data.publishing.ckan2zenodo.model.faults.ConfigurationException; import org.gcube.data.publishing.ckan2zenodo.model.parsing.Mappings; import org.junit.Test; diff --git a/src/test/java/org/gcube/tests/TestCommons.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/TestCommons.java similarity index 95% rename from src/test/java/org/gcube/tests/TestCommons.java rename to src/test/java/org/gcube/data/publishing/ckan2zenodo/TestCommons.java index 065bedf..0b0fa8e 100644 --- a/src/test/java/org/gcube/tests/TestCommons.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/TestCommons.java @@ -1,4 +1,4 @@ -package org.gcube.tests; +package org.gcube.data.publishing.ckan2zenodo; import java.io.IOException; import java.io.PrintStream; @@ -7,8 +7,6 @@ import java.nio.file.Paths; import java.util.HashMap; import java.util.Map; -import org.gcube.data.publishing.ckan2zenodo.Fixer; -import org.gcube.data.publishing.ckan2zenodo.Translator; import org.gcube.data.publishing.ckan2zenodo.model.CkanItemDescriptor; import org.gcube.data.publishing.ckan2zenodo.model.CkanResource; import org.gcube.data.publishing.ckan2zenodo.model.ZenodoCredentials; diff --git a/src/test/java/org/gcube/tests/TokenSetter.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/TokenSetter.java similarity index 94% rename from src/test/java/org/gcube/tests/TokenSetter.java rename to src/test/java/org/gcube/data/publishing/ckan2zenodo/TokenSetter.java index a6180d5..e59d40b 100644 --- a/src/test/java/org/gcube/tests/TokenSetter.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/TokenSetter.java @@ -1,4 +1,4 @@ -package org.gcube.tests; +package org.gcube.data.publishing.ckan2zenodo; import java.util.Properties; diff --git a/src/test/java/org/gcube/tests/TransformationTests.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/TransformationTests.java similarity index 96% rename from src/test/java/org/gcube/tests/TransformationTests.java rename to src/test/java/org/gcube/data/publishing/ckan2zenodo/TransformationTests.java index 271bd7b..8629bdd 100644 --- a/src/test/java/org/gcube/tests/TransformationTests.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/TransformationTests.java @@ -1,4 +1,4 @@ -package org.gcube.tests; +package org.gcube.data.publishing.ckan2zenodo; import static org.junit.Assert.assertTrue; @@ -9,7 +9,6 @@ import java.util.Map.Entry; import org.gcube.common.resources.gcore.GenericResource; import org.gcube.common.resources.gcore.Resources; -import org.gcube.data.publishing.ckan2zenodo.Translator; import org.gcube.data.publishing.ckan2zenodo.commons.IS; import org.gcube.data.publishing.ckan2zenodo.model.CkanItemDescriptor; import org.gcube.data.publishing.ckan2zenodo.model.CkanResource; @@ -17,7 +16,6 @@ import org.gcube.data.publishing.ckan2zenodo.model.faults.ConfigurationException import org.gcube.data.publishing.ckan2zenodo.model.parsing.Filter; import org.gcube.data.publishing.ckan2zenodo.model.parsing.Mapping; import org.gcube.data.publishing.ckan2zenodo.model.parsing.Mappings; -import org.gcube.data.publishing.ckan2zenodo.model.parsing.Regexp; import org.junit.BeforeClass; import org.junit.Test; diff --git a/src/test/java/org/gcube/tests/ZenodoTests.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/ZenodoTests.java similarity index 98% rename from src/test/java/org/gcube/tests/ZenodoTests.java rename to src/test/java/org/gcube/data/publishing/ckan2zenodo/ZenodoTests.java index b822051..b8aa8e2 100644 --- a/src/test/java/org/gcube/tests/ZenodoTests.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/ZenodoTests.java @@ -1,4 +1,4 @@ -package org.gcube.tests; +package org.gcube.data.publishing.ckan2zenodo; import java.io.IOException; import java.util.Arrays; @@ -8,7 +8,6 @@ import java.util.Map.Entry; import org.gcube.common.resources.gcore.GenericResource; import org.gcube.common.resources.gcore.Resources; -import org.gcube.data.publishing.ckan2zenodo.Translator; import org.gcube.data.publishing.ckan2zenodo.clients.Zenodo; import org.gcube.data.publishing.ckan2zenodo.commons.IS; import org.gcube.data.publishing.ckan2zenodo.commons.Net; diff --git a/src/test/java/org/gcube/data/publishing/ckan2zenodo/model/NetTests.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/model/NetTests.java new file mode 100644 index 0000000..64614e6 --- /dev/null +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/model/NetTests.java @@ -0,0 +1,49 @@ +package org.gcube.data.publishing.ckan2zenodo.model; + +import org.gcube.data.publishing.ckan2zenodo.commons.Net; +import org.junit.Test; + +import static junit.framework.TestCase.assertTrue; + +public class NetTests { + + @Test + public void testfileNames() throws Exception { + + CkanResource res=new CkanResource(); + res.setName("Deliverable"); + res.setDescription("My description"); + res.setId("resource_id"); + + // PDF URL + res.setUrl("https://data-pre.d4science.net/RgA7"); + check(Net.download(res),"Deliverable.pdf",true); + + // Do not use HEAD if extension in resource name + res.setName("Deliverable.rtf"); + check(Net.download(res),res.getName(),true); + + //Check invalid urls i.e. folder url == UNABLE TO GET FILENAME FROM HEAD + res.setUrl("http://data-pre.d4science.org/workspace-explorer-app?folderId=UjV1MTJ4K2lvQU5MRE1MT2NCOEVGWDkvMG5SL2dwY3A0QmpWZmdRVEFxR3Njd2cwcUxUQ3BBZzZxa1FhN3JQTQ"); + // Still should use resource name + check(Net.download(res),res.getName(),true); + + //Check invalid urls i.e. folder url == UNABLE TO GET FILENAME FROM HEAD + res.setName("Deliverable"); + //Shouldn't have extension + check(Net.download(res),res.getName(),false); + } + + + private static final void check(DownloadedFile f, String expectedFilename, boolean expectExtension){ + System.out.println(f); + System.out.println("Resulting filename is : "+f.getToUseFileName()); + assertTrue(f.getRemoteFileName()!=null); + assertTrue(f.getToUseFileName()!=null); + if(expectExtension) + assertTrue(DownloadedFile.getExtension(f.getToUseFileName())!=null); + else assertTrue(DownloadedFile.getExtension(f.getToUseFileName())==null); + assertTrue(f.getToUseFileName().equals(expectedFilename)); + } + +} From 7ce2c222e361c8c75a8999b35fa02c3f8a0eba14 Mon Sep 17 00:00:00 2001 From: Fabio Sinibaldi Date: Tue, 1 Mar 2022 17:49:07 +0100 Subject: [PATCH 2/8] Removed unused dependency --- pom.xml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pom.xml b/pom.xml index 85f7d0e..8993a6c 100644 --- a/pom.xml +++ b/pom.xml @@ -59,12 +59,6 @@ 1.14.8 - - - commons-io - commons-io - 1.4.0 - org.glassfish.jersey.media From 90466b296c3eb395b5d20158d8012b87daa2bf4a Mon Sep 17 00:00:00 2001 From: Fabio Sinibaldi Date: Wed, 2 Mar 2022 11:46:23 +0100 Subject: [PATCH 3/8] Moved net logic to Downloaded file --- .../ckan2zenodo/clients/UploadFilesCall.java | 72 ++++----- .../publishing/ckan2zenodo/commons/Net.java | 83 ---------- .../ckan2zenodo/model/DownloadedFile.java | 150 ++++++++++++++++-- .../publishing/ckan2zenodo/ZenodoTests.java | 9 +- .../ckan2zenodo/model/NetTests.java | 13 +- 5 files changed, 179 insertions(+), 148 deletions(-) delete mode 100644 src/main/java/org/gcube/data/publishing/ckan2zenodo/commons/Net.java diff --git a/src/main/java/org/gcube/data/publishing/ckan2zenodo/clients/UploadFilesCall.java b/src/main/java/org/gcube/data/publishing/ckan2zenodo/clients/UploadFilesCall.java index 31eb9a9..18d92c3 100644 --- a/src/main/java/org/gcube/data/publishing/ckan2zenodo/clients/UploadFilesCall.java +++ b/src/main/java/org/gcube/data/publishing/ckan2zenodo/clients/UploadFilesCall.java @@ -1,10 +1,9 @@ package org.gcube.data.publishing.ckan2zenodo.clients; import java.nio.file.Files; -import java.util.Collection; +import java.util.*; import java.util.concurrent.Callable; -import org.gcube.data.publishing.ckan2zenodo.commons.Net; import org.gcube.data.publishing.ckan2zenodo.model.CkanResource; import org.gcube.data.publishing.ckan2zenodo.model.DownloadedFile; import org.gcube.data.publishing.ckan2zenodo.model.zenodo.FileDeposition; @@ -40,64 +39,55 @@ public class UploadFilesCall implements Callable{ log.debug("Removing not referenced files.."); + HashMap resourceMap=new HashMap<>(); + for(CkanResource r:toUpload) { + DownloadedFile d=new DownloadedFile(r); + resourceMap.put(d.getToUseFileName(),d); + } + Set alreadyExistingFiles=new HashSet<>(); + for(FileDeposition f:dep.getFiles()) { CkanResource found=null; - for(CkanResource r:toUpload) - if(r.getName().equals(f.getFilename())) { - found=r; - break; - } - if(found==null) // File not present in current toUpload set + if(resourceMap.containsKey(f.getFilename())){ + alreadyExistingFiles.add(f.getFilename()); try{ + // check for update + DownloadedFile downloaded = resourceMap.get(f.getFilename()); + if(!downloaded.getMD5().equals(f.getChecksum())) { + log.debug("MD5 differ, going to update : "+downloaded+" - "+f); + z.deleteFile(dep, f); + z.uploadFile(dep, found.getName(), downloaded.getFile()); + } + }catch (Throwable t){ + log.warn("Unable to update "+f,t); + } + }else { + try{ + // remove File not present in current toUpload set log.debug("Remote file "+f+" is not in requested set. Deleting it.."); z.deleteFile(dep, f); }catch(Throwable t) { log.warn("Unable to delete "+f,t); } - else { - // File present, checking for update - DownloadedFile downloaded=null; - try { - log.debug("Found already existing remote file "+f); - downloaded=Net.download(found); - if(!downloaded.getMD5().equals(f.getChecksum())) { - log.debug("MD5 differ, going to update : "+downloaded+" - "+f); - z.deleteFile(dep, f); - z.uploadFile(dep, found.getName(), downloaded.getF()); - } - }catch(Throwable t) { - log.warn("Unable to update "+f,t); - }finally { - if(downloaded!=null) Files.deleteIfExists(downloaded.getF().toPath()); - } } } log.debug("Going to push additional resources for "+deposition.getTitle()+" ID : "+deposition.getId()); - for(CkanResource r:toUpload) { - DownloadedFile downloaded=null; - try { - boolean found=false; - - for(FileDeposition f:dep.getFiles()) - if(f.getFilename().equals(r.getName())) { - found=true; - break; - } - if(!found) { - downloaded=Net.download(r); - z.uploadFile(dep, r.getName(),downloaded.getF()); - } + for(Map.Entry e : resourceMap.entrySet()){ + DownloadedFile downloadedFile=e.getValue(); + if(!alreadyExistingFiles.contains(e.getKey())) + try{ + // Upload new file + z.uploadFile(dep,downloadedFile.getToUseFileName(),downloadedFile.getFile()); }catch(Throwable t) { - log.warn("Unable to upload "+r.getName()+".",t); + log.warn("Unable to upload "+downloadedFile.getSource().getName(),t); } } + return z.readDeposition(dep.getId()); } - - } diff --git a/src/main/java/org/gcube/data/publishing/ckan2zenodo/commons/Net.java b/src/main/java/org/gcube/data/publishing/ckan2zenodo/commons/Net.java deleted file mode 100644 index 1cf5754..0000000 --- a/src/main/java/org/gcube/data/publishing/ckan2zenodo/commons/Net.java +++ /dev/null @@ -1,83 +0,0 @@ -package org.gcube.data.publishing.ckan2zenodo.commons; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.net.HttpURLConnection; -import java.net.URL; -import java.nio.file.Files; -import java.nio.file.StandardCopyOption; -import java.security.DigestInputStream; -import java.security.MessageDigest; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.gcube.data.publishing.ckan2zenodo.model.CkanResource; -import org.gcube.data.publishing.ckan2zenodo.model.DownloadedFile; - -import lombok.extern.slf4j.Slf4j; - -@Slf4j -public class Net { - - private static final Pattern FILENAME_IN_DEPOSITION_REGEXP = Pattern.compile("(?<=filename\\=\\\").*(?=\\\")"); - - - public static DownloadedFile download(CkanResource toDownload) throws Exception { - String urlString=toDownload.getUrl(); - log.debug("Downloading "+urlString); - //Download locally into temp - URL url=new URL(urlString); - File temp=File.createTempFile("zenodo_", ".tmp"); - MessageDigest md = MessageDigest.getInstance("MD5"); - - // Multiple tries - InputStream is=null; - int attempt=0; - Exception lastException=null; - - String remoteFileName=null; - - while(is==null&&attempt<5) { - try { - attempt++; - is=url.openStream(); - - if(remoteFileName == null) - remoteFileName = getFilenameFromURL(url); - - }catch(Exception e) { - lastException=e; - try{ - Thread.sleep(500*attempt); - }catch(InterruptedException e1) {} - } - } - if(is==null) throw new Exception("Unable to download "+urlString,lastException); - if(remoteFileName == null) - remoteFileName = ""; // Unable to evaluate from HEAD - - - DigestInputStream dis = new DigestInputStream(is, md); - - - // Download - long size=Files.copy(is, temp.toPath(),StandardCopyOption.REPLACE_EXISTING); - - return new DownloadedFile(toDownload,temp,dis.getMessageDigest().toString(),remoteFileName); - - - } - - - private static final String getFilenameFromURL(URL url) throws IOException { - HttpURLConnection con = (HttpURLConnection) url.openConnection(); - con.setRequestMethod("GET"); - String contentDisp= con.getHeaderField("Content-Disposition"); - - Matcher m = FILENAME_IN_DEPOSITION_REGEXP.matcher(contentDisp); - m.find(); - return m.group(0); - } - -} diff --git a/src/main/java/org/gcube/data/publishing/ckan2zenodo/model/DownloadedFile.java b/src/main/java/org/gcube/data/publishing/ckan2zenodo/model/DownloadedFile.java index 7c572c3..fe5c8c3 100644 --- a/src/main/java/org/gcube/data/publishing/ckan2zenodo/model/DownloadedFile.java +++ b/src/main/java/org/gcube/data/publishing/ckan2zenodo/model/DownloadedFile.java @@ -1,29 +1,55 @@ package org.gcube.data.publishing.ckan2zenodo.model; import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; +import java.security.DigestInputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.Setter; -import lombok.ToString; +import lombok.*; +import lombok.extern.slf4j.Slf4j; -@Getter -@Setter -@AllArgsConstructor -@ToString + +@RequiredArgsConstructor +@Slf4j public class DownloadedFile { + private static final Pattern FILENAME_IN_DEPOSITION_REGEXP = Pattern.compile("(?<=filename\\=\\\").*(?=\\\")"); + + @NonNull + @Getter private CkanResource source; - private File f; - private String MD5; - private String remoteFileName; + //private File f; + private File f=null; + private String MD5=null; + private String remoteFileName=null; - public String getToUseFileName(){ + @Override + public String toString() { + return "DownloadedFile{" + + "source=" + source + + ", f=" + f + + ", MD5='" + MD5 + '\'' + + ", remoteFileName='" + remoteFileName + '\'' + + '}'; + } + + public String getToUseFileName() throws Exception { if (getExtension(source.getName())!=null){ // source contains extension return source.getName(); }else { + if(remoteFileName == null){ + initRemoteFileName(); + } String evaluatedExtension=getExtension(remoteFileName); if(evaluatedExtension!=null) return source.getName()+evaluatedExtension; @@ -31,6 +57,26 @@ public class DownloadedFile { } } + + + public File getFile() throws Exception { + if(f==null){ + download(); + } + return f; + } + + public String getMD5() throws Exception { + if(MD5==null){ + download(); + } + return MD5; + } + + + + + static final String getExtension(String filename){ int lastIndexOf = filename.lastIndexOf("."); if (lastIndexOf == -1) { @@ -38,4 +84,84 @@ public class DownloadedFile { } return filename.substring(lastIndexOf); } + + + /** + * Actually downlaods the file INIT File, MD5 and remoteFilename. + */ + private void download() throws Exception { + log.info("Downloading {} from {}",source.getName(),source.getUrl()); + URL url=new URL(source.getUrl()); + f=File.createTempFile("zenodo_", ".tmp"); + MessageDigest md = MessageDigest.getInstance("MD5"); + + // Multiple tries + InputStream is=null; + int attempt=0; + Exception lastException=null; + + + while(is==null&&attempt<5) { + try { + attempt++; + is=url.openStream(); + + if(remoteFileName == null) + remoteFileName = getFilenameFromURL(url); + + }catch(Exception e) { + lastException=e; + try{ + Thread.sleep(500*attempt); + }catch(InterruptedException e1) {} + } + } + if(is==null) throw new Exception("Unable to download "+source.getUrl(),lastException); + + if(remoteFileName == null) + remoteFileName = ""; // Unable to evaluate from HEAD + + + DigestInputStream dis = new DigestInputStream(is, md); + MD5 = dis.getMessageDigest().toString(); + + // Download + long size= Files.copy(is, f.toPath(), StandardCopyOption.REPLACE_EXISTING); + log.info("Received {} bytes for {} ",size,source.getName()); + } + + /** + * Performs HTTP HEAD and INIT remoteFileName + */ + private void initRemoteFileName() throws Exception { + int attempt =0; + Exception lastException=null; + URL url=new URL(source.getUrl()); + while(remoteFileName==null&&attempt<5) { + try { + attempt++; + remoteFileName = getFilenameFromURL(url); + }catch(Exception e) { + lastException=e; + try{ + Thread.sleep(500*attempt); + }catch(InterruptedException e1) {} + } + } + if(remoteFileName == null) { + remoteFileName = ""; // Unable to evaluate from HEAD + log.warn("Unable to get remote file name from {} [resource Name {}]",source.getUrl(),source.getName(),lastException); + } + } + + + private static final String getFilenameFromURL(URL url) throws IOException { + HttpURLConnection con = (HttpURLConnection) url.openConnection(); + con.setRequestMethod("GET"); + String contentDisp= con.getHeaderField("Content-Disposition"); + + Matcher m = FILENAME_IN_DEPOSITION_REGEXP.matcher(contentDisp); + m.find(); + return m.group(0); + } } diff --git a/src/test/java/org/gcube/data/publishing/ckan2zenodo/ZenodoTests.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/ZenodoTests.java index b8aa8e2..49c3122 100644 --- a/src/test/java/org/gcube/data/publishing/ckan2zenodo/ZenodoTests.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/ZenodoTests.java @@ -10,7 +10,6 @@ import org.gcube.common.resources.gcore.GenericResource; import org.gcube.common.resources.gcore.Resources; import org.gcube.data.publishing.ckan2zenodo.clients.Zenodo; import org.gcube.data.publishing.ckan2zenodo.commons.IS; -import org.gcube.data.publishing.ckan2zenodo.commons.Net; import org.gcube.data.publishing.ckan2zenodo.model.CkanItemDescriptor; import org.gcube.data.publishing.ckan2zenodo.model.CkanRelatedIdentifier; import org.gcube.data.publishing.ckan2zenodo.model.CkanResource; @@ -88,8 +87,8 @@ public class ZenodoTests { CkanItemDescriptor desc=new CkanItemDescriptor(json); for(CkanResource cRes:tran.filterResources(desc)) { - DownloadedFile f=Net.download(cRes); - FileDeposition file=z.uploadFile(dep, cRes.getName(), f.getF()); + DownloadedFile f=new DownloadedFile(cRes); + FileDeposition file=z.uploadFile(dep, cRes.getName(), f.getFile()); System.out.println("Published "+file); } @@ -116,9 +115,9 @@ public class ZenodoTests { CkanItemDescriptor desc=new CkanItemDescriptor(json); for(CkanResource cRes:tran.filterResources(desc)) { - DownloadedFile f=Net.download(cRes); + DownloadedFile f=new DownloadedFile(cRes); - FileDeposition file=z.uploadFile(dep, cRes.getName(), f.getF()); + FileDeposition file=z.uploadFile(dep, cRes.getName(), f.getFile()); System.out.println("Published "+file); } dep=z.publish(dep); diff --git a/src/test/java/org/gcube/data/publishing/ckan2zenodo/model/NetTests.java b/src/test/java/org/gcube/data/publishing/ckan2zenodo/model/NetTests.java index 64614e6..02ed34f 100644 --- a/src/test/java/org/gcube/data/publishing/ckan2zenodo/model/NetTests.java +++ b/src/test/java/org/gcube/data/publishing/ckan2zenodo/model/NetTests.java @@ -1,6 +1,5 @@ package org.gcube.data.publishing.ckan2zenodo.model; -import org.gcube.data.publishing.ckan2zenodo.commons.Net; import org.junit.Test; import static junit.framework.TestCase.assertTrue; @@ -17,28 +16,28 @@ public class NetTests { // PDF URL res.setUrl("https://data-pre.d4science.net/RgA7"); - check(Net.download(res),"Deliverable.pdf",true); + check(new DownloadedFile(res),"Deliverable.pdf",true); + // Do not use HEAD if extension in resource name res.setName("Deliverable.rtf"); - check(Net.download(res),res.getName(),true); + check(new DownloadedFile(res),res.getName(),true); //Check invalid urls i.e. folder url == UNABLE TO GET FILENAME FROM HEAD res.setUrl("http://data-pre.d4science.org/workspace-explorer-app?folderId=UjV1MTJ4K2lvQU5MRE1MT2NCOEVGWDkvMG5SL2dwY3A0QmpWZmdRVEFxR3Njd2cwcUxUQ3BBZzZxa1FhN3JQTQ"); // Still should use resource name - check(Net.download(res),res.getName(),true); + check(new DownloadedFile(res),res.getName(),true); //Check invalid urls i.e. folder url == UNABLE TO GET FILENAME FROM HEAD res.setName("Deliverable"); //Shouldn't have extension - check(Net.download(res),res.getName(),false); + check(new DownloadedFile(res),res.getName(),false); } - private static final void check(DownloadedFile f, String expectedFilename, boolean expectExtension){ + private static final void check(DownloadedFile f, String expectedFilename, boolean expectExtension) throws Exception { System.out.println(f); System.out.println("Resulting filename is : "+f.getToUseFileName()); - assertTrue(f.getRemoteFileName()!=null); assertTrue(f.getToUseFileName()!=null); if(expectExtension) assertTrue(DownloadedFile.getExtension(f.getToUseFileName())!=null); From 70d7802cbeda826ee4a1c1327424da81aa9dc785 Mon Sep 17 00:00:00 2001 From: Fabio Sinibaldi Date: Tue, 28 Mar 2023 12:31:36 +0200 Subject: [PATCH 4/8] Updated bom --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8993a6c..ffda321 100644 --- a/pom.xml +++ b/pom.xml @@ -31,7 +31,7 @@ org.gcube.distribution gcube-bom - 2.0.1 + 2.1.0 pom import From 5b969ac2cbf840a47dba625d4722542af88fa2eb Mon Sep 17 00:00:00 2001 From: Fabio Sinibaldi Date: Tue, 28 Mar 2023 12:31:54 +0200 Subject: [PATCH 5/8] Added log --- .../org/gcube/data/publishing/ckan2zenodo/clients/Zenodo.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/gcube/data/publishing/ckan2zenodo/clients/Zenodo.java b/src/main/java/org/gcube/data/publishing/ckan2zenodo/clients/Zenodo.java index 263dd6c..7c91d4b 100644 --- a/src/main/java/org/gcube/data/publishing/ckan2zenodo/clients/Zenodo.java +++ b/src/main/java/org/gcube/data/publishing/ckan2zenodo/clients/Zenodo.java @@ -129,8 +129,9 @@ public class Zenodo { } public FileDeposition uploadFile(ZenodoDeposition deposition, String toUploadName,File toUpload) throws ZenodoException { + final ZenodoDeposition dep=(deposition.getSubmitted())?newVersion(deposition.getId()):deposition; - + log.info("Pushing File {} to Deposition {}",toUploadName,dep); Callable call=new Callable() { @Override From 306c6723b801c1e5bc40e078c15655ceed788f30 Mon Sep 17 00:00:00 2001 From: Fabio Sinibaldi Date: Tue, 28 Mar 2023 12:32:15 +0200 Subject: [PATCH 6/8] Removed exception for missing configurations --- .../gcube/data/publishing/ckan2zenodo/TransformerManager.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/java/org/gcube/data/publishing/ckan2zenodo/TransformerManager.java b/src/main/java/org/gcube/data/publishing/ckan2zenodo/TransformerManager.java index de04461..4d51b2a 100644 --- a/src/main/java/org/gcube/data/publishing/ckan2zenodo/TransformerManager.java +++ b/src/main/java/org/gcube/data/publishing/ckan2zenodo/TransformerManager.java @@ -35,8 +35,7 @@ public class TransformerManager { else return new Translator(m); } } - throw new ConfigurationException("No specific mapping for the catalogue item has been configured. " - + "By continuing with the upload some metadata might not be upload to Zenodo."); + return new Translator(); } From f60c3b649cc5c751ff9920860ff5afcb76cc4eb3 Mon Sep 17 00:00:00 2001 From: Fabio Sinibaldi Date: Tue, 28 Mar 2023 12:33:05 +0200 Subject: [PATCH 7/8] Release --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index ffda321..ac80246 100644 --- a/pom.xml +++ b/pom.xml @@ -8,7 +8,7 @@ org.gcube.data.publishing ckan2zenodo-library - 1.0.3-SNAPSHOT + 1.0.3 CKAN 2 Zenodo Library Library to publish d4science CKAN items into Zenodo From de4c509719be607ac6f57c82e2ed1f500c0566fb Mon Sep 17 00:00:00 2001 From: Fabio Sinibaldi Date: Tue, 28 Mar 2023 12:34:47 +0200 Subject: [PATCH 8/8] Release --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b6ab80d..17b2013 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). # Changelog for org.gcube.data.publishing.ckan2zenodo-library -## [v1.0.3-SNAPSHOT] 2022-03-01 -- Extensions from URL [#22889](https://support.d4science.org/issues/22889) +## [v1.0.3] 2023-03-28 +- Extensions evaluated from URL [#22889](https://support.d4science.org/issues/22889)