Moved net logic to Downloaded file

This commit is contained in:
Fabio Sinibaldi 2022-03-02 11:46:23 +01:00
parent 7ce2c222e3
commit 90466b296c
5 changed files with 179 additions and 148 deletions

View File

@ -1,10 +1,9 @@
package org.gcube.data.publishing.ckan2zenodo.clients;
import java.nio.file.Files;
import java.util.Collection;
import java.util.*;
import java.util.concurrent.Callable;
import org.gcube.data.publishing.ckan2zenodo.commons.Net;
import org.gcube.data.publishing.ckan2zenodo.model.CkanResource;
import org.gcube.data.publishing.ckan2zenodo.model.DownloadedFile;
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.FileDeposition;
@ -40,64 +39,55 @@ public class UploadFilesCall implements Callable<ZenodoDeposition>{
log.debug("Removing not referenced files..");
HashMap<String,DownloadedFile> resourceMap=new HashMap<>();
for(CkanResource r:toUpload) {
DownloadedFile d=new DownloadedFile(r);
resourceMap.put(d.getToUseFileName(),d);
}
Set<String> alreadyExistingFiles=new HashSet<>();
for(FileDeposition f:dep.getFiles()) {
CkanResource found=null;
for(CkanResource r:toUpload)
if(r.getName().equals(f.getFilename())) {
found=r;
break;
}
if(found==null) // File not present in current toUpload set
if(resourceMap.containsKey(f.getFilename())){
alreadyExistingFiles.add(f.getFilename());
try{
// check for update
DownloadedFile downloaded = resourceMap.get(f.getFilename());
if(!downloaded.getMD5().equals(f.getChecksum())) {
log.debug("MD5 differ, going to update : "+downloaded+" - "+f);
z.deleteFile(dep, f);
z.uploadFile(dep, found.getName(), downloaded.getFile());
}
}catch (Throwable t){
log.warn("Unable to update "+f,t);
}
}else {
try{
// remove File not present in current toUpload set
log.debug("Remote file "+f+" is not in requested set. Deleting it..");
z.deleteFile(dep, f);
}catch(Throwable t) {
log.warn("Unable to delete "+f,t);
}
else {
// File present, checking for update
DownloadedFile downloaded=null;
try {
log.debug("Found already existing remote file "+f);
downloaded=Net.download(found);
if(!downloaded.getMD5().equals(f.getChecksum())) {
log.debug("MD5 differ, going to update : "+downloaded+" - "+f);
z.deleteFile(dep, f);
z.uploadFile(dep, found.getName(), downloaded.getF());
}
}catch(Throwable t) {
log.warn("Unable to update "+f,t);
}finally {
if(downloaded!=null) Files.deleteIfExists(downloaded.getF().toPath());
}
}
}
log.debug("Going to push additional resources for "+deposition.getTitle()+" ID : "+deposition.getId());
for(CkanResource r:toUpload) {
DownloadedFile downloaded=null;
try {
boolean found=false;
for(FileDeposition f:dep.getFiles())
if(f.getFilename().equals(r.getName())) {
found=true;
break;
}
if(!found) {
downloaded=Net.download(r);
z.uploadFile(dep, r.getName(),downloaded.getF());
}
for(Map.Entry<String,DownloadedFile> e : resourceMap.entrySet()){
DownloadedFile downloadedFile=e.getValue();
if(!alreadyExistingFiles.contains(e.getKey()))
try{
// Upload new file
z.uploadFile(dep,downloadedFile.getToUseFileName(),downloadedFile.getFile());
}catch(Throwable t) {
log.warn("Unable to upload "+r.getName()+".",t);
log.warn("Unable to upload "+downloadedFile.getSource().getName(),t);
}
}
return z.readDeposition(dep.getId());
}
}

View File

@ -1,83 +0,0 @@
package org.gcube.data.publishing.ckan2zenodo.commons;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.gcube.data.publishing.ckan2zenodo.model.CkanResource;
import org.gcube.data.publishing.ckan2zenodo.model.DownloadedFile;
import lombok.extern.slf4j.Slf4j;
/**
 * Network helper that downloads the content referenced by a {@link CkanResource}
 * into a local temporary file and evaluates its MD5 checksum.
 */
@Slf4j
public class Net {

    private static final Pattern FILENAME_IN_DEPOSITION_REGEXP = Pattern.compile("(?<=filename\\=\\\").*(?=\\\")");

    /**
     * Downloads the resource URL into a temporary file.
     *
     * @param toDownload the resource whose URL has to be fetched
     * @return the downloaded file descriptor: local temp file, hex encoded MD5 of the
     *         received bytes and remote file name (empty string if it cannot be evaluated)
     * @throws Exception if the URL is invalid, the stream cannot be opened after 5 attempts
     *         or the copy fails; in these cases the temporary file is removed
     */
    public static DownloadedFile download(CkanResource toDownload) throws Exception {
        String urlString=toDownload.getUrl();
        log.debug("Downloading "+urlString);

        URL url=new URL(urlString);
        MessageDigest md = MessageDigest.getInstance("MD5");

        //Download locally into temp
        File temp=File.createTempFile("zenodo_", ".tmp");
        boolean completed=false;
        try {
            // Multiple tries
            InputStream is=null;
            int attempt=0;
            Exception lastException=null;
            while(is==null&&attempt<5) {
                attempt++;
                try {
                    is=url.openStream();
                }catch(Exception e) {
                    lastException=e;
                    try{
                        Thread.sleep(500L*attempt);
                    }catch(InterruptedException e1) {
                        // Preserve the interruption for the caller and stop retrying
                        Thread.currentThread().interrupt();
                        break;
                    }
                }
            }
            if(is==null) throw new Exception("Unable to download "+urlString,lastException);

            // The copy MUST read through the DigestInputStream, otherwise the digest is never updated
            try(DigestInputStream dis = new DigestInputStream(is, md)) {
                Files.copy(dis, temp.toPath(),StandardCopyOption.REPLACE_EXISTING);
            }

            // Remote file name is best effort : a failure here must not invalidate the download
            String remoteFileName;
            try {
                remoteFileName = getFilenameFromURL(url);
            }catch(Exception e) {
                log.debug("Unable to evaluate remote file name for "+urlString,e);
                remoteFileName = "";
            }

            // Hex encoding of the digest evaluated on the received bytes
            StringBuilder md5Hex=new StringBuilder();
            for(byte b:md.digest())
                md5Hex.append(String.format("%02x", b));

            DownloadedFile toReturn=new DownloadedFile(toDownload,temp,md5Hex.toString(),remoteFileName);
            completed=true;
            return toReturn;
        }finally {
            // Do not leave orphan temp files behind failed downloads
            if(!completed) Files.deleteIfExists(temp.toPath());
        }
    }

    /**
     * Extracts the file name declared in the Content-Disposition header returned by the URL.
     *
     * @param url the URL to query
     * @return the file name found in the header
     * @throws IOException if the connection fails, the header is missing or it declares no file name
     */
    private static final String getFilenameFromURL(URL url) throws IOException {
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        try {
            con.setRequestMethod("GET");
            String contentDisp= con.getHeaderField("Content-Disposition");
            if(contentDisp==null)
                throw new IOException("No Content-Disposition header returned by "+url);
            Matcher m = FILENAME_IN_DEPOSITION_REGEXP.matcher(contentDisp);
            if(!m.find())
                throw new IOException("No filename in Content-Disposition header ["+contentDisp+"] returned by "+url);
            return m.group(0);
        }finally {
            // Only headers are needed, release the connection
            con.disconnect();
        }
    }
}

View File

@ -1,29 +1,55 @@
package org.gcube.data.publishing.ckan2zenodo.model;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;
import lombok.*;
import lombok.extern.slf4j.Slf4j;
@Getter
@Setter
@AllArgsConstructor
@ToString
@RequiredArgsConstructor
@Slf4j
public class DownloadedFile {
private static final Pattern FILENAME_IN_DEPOSITION_REGEXP = Pattern.compile("(?<=filename\\=\\\").*(?=\\\")");
@NonNull
@Getter
private CkanResource source;
private File f;
private String MD5;
private String remoteFileName;
//private File f;
private File f=null;
private String MD5=null;
private String remoteFileName=null;
public String getToUseFileName(){
/**
 * Textual description built from the current field values.
 * Fields are read directly, so no download is triggered by this method.
 */
@Override
public String toString() {
    StringBuilder description = new StringBuilder("DownloadedFile{");
    description.append("source=").append(source);
    description.append(", f=").append(f);
    description.append(", MD5='").append(MD5).append('\'');
    description.append(", remoteFileName='").append(remoteFileName).append('\'');
    description.append('}');
    return description.toString();
}
public String getToUseFileName() throws Exception {
if (getExtension(source.getName())!=null){
// source contains extension
return source.getName();
}else {
if(remoteFileName == null){
initRemoteFileName();
}
String evaluatedExtension=getExtension(remoteFileName);
if(evaluatedExtension!=null)
return source.getName()+evaluatedExtension;
@ -31,6 +57,26 @@ public class DownloadedFile {
}
}
/**
 * Returns the local file, invoking download() first when it has not been set yet.
 *
 * @return the value of field f after the optional download
 * @throws Exception whatever download() throws
 */
public File getFile() throws Exception {
    if (f != null) {
        return f;
    }
    download();
    return f;
}
/**
 * Returns the MD5 value, invoking download() first when it has not been set yet.
 *
 * @return the value of field MD5 after the optional download
 * @throws Exception whatever download() throws
 */
public String getMD5() throws Exception {
    if (MD5 != null) {
        return MD5;
    }
    download();
    return MD5;
}
static final String getExtension(String filename){
int lastIndexOf = filename.lastIndexOf(".");
if (lastIndexOf == -1) {
@ -38,4 +84,84 @@ public class DownloadedFile {
}
return filename.substring(lastIndexOf);
}
/**
 * Actually downloads the file, initializing the local file, its MD5 and (if still
 * unset) the remote file name.
 *
 * The fields are assigned only after a successful download; on failure the temporary
 * file is removed and the fields are left untouched, so a later call can retry.
 *
 * @throws Exception if the source URL is invalid, the stream cannot be opened after
 *         5 attempts or the copy fails
 */
private void download() throws Exception {
    log.info("Downloading {} from {}",source.getName(),source.getUrl());
    URL url=new URL(source.getUrl());
    MessageDigest md = MessageDigest.getInstance("MD5");

    File target=File.createTempFile("zenodo_", ".tmp");
    boolean completed=false;
    try {
        // Multiple tries
        InputStream is=null;
        int attempt=0;
        Exception lastException=null;
        while(is==null&&attempt<5) {
            attempt++;
            try {
                is=url.openStream();
            }catch(Exception e) {
                lastException=e;
                try{
                    Thread.sleep(500L*attempt);
                }catch(InterruptedException e1) {
                    // Preserve the interruption for the caller and stop retrying
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
        if(is==null) throw new Exception("Unable to download "+source.getUrl(),lastException);

        // Download : the copy MUST read through the DigestInputStream, otherwise the digest is never updated
        long size;
        try(DigestInputStream dis = new DigestInputStream(is, md)) {
            size=Files.copy(dis, target.toPath(), StandardCopyOption.REPLACE_EXISTING);
        }

        // Remote file name is best effort : a failure here must not invalidate the download
        if(remoteFileName == null) {
            try {
                remoteFileName = getFilenameFromURL(url);
            }catch(Exception e) {
                log.debug("Unable to evaluate remote file name from {}",source.getUrl(),e);
                remoteFileName = ""; // Unable to evaluate remote file name
            }
        }

        // Hex encoding of the digest evaluated on the received bytes
        StringBuilder md5Hex=new StringBuilder();
        for(byte b:md.digest())
            md5Hex.append(String.format("%02x", b));

        f=target;
        MD5=md5Hex.toString();
        completed=true;
        log.info("Received {} bytes for {} ",size,source.getName());
    }finally {
        // Do not leave orphan temp files behind failed downloads
        if(!completed) Files.deleteIfExists(target.toPath());
    }
}
/**
 * Initializes remoteFileName by querying the source URL through getFilenameFromURL,
 * retrying up to 5 times with an increasing pause between attempts.
 *
 * If the name cannot be evaluated (or the thread is interrupted while waiting)
 * remoteFileName is set to the empty string and a warning is logged.
 *
 * @throws Exception if the source URL is malformed
 */
private void initRemoteFileName() throws Exception {
    int attempt =0;
    Exception lastException=null;
    URL url=new URL(source.getUrl());
    while(remoteFileName==null&&attempt<5) {
        attempt++;
        try {
            remoteFileName = getFilenameFromURL(url);
        }catch(Exception e) {
            lastException=e;
            try{
                Thread.sleep(500L*attempt);
            }catch(InterruptedException e1) {
                // Preserve the interruption for the caller and stop retrying
                Thread.currentThread().interrupt();
                break;
            }
        }
    }
    if(remoteFileName == null) {
        remoteFileName = ""; // Unable to evaluate remote file name
        log.warn("Unable to get remote file name from {} [resource Name {}]",source.getUrl(),source.getName(),lastException);
    }
}
/**
 * Extracts the file name declared in the Content-Disposition header returned by the URL.
 *
 * @param url the URL to query
 * @return the file name found in the header
 * @throws IOException if the connection fails, the header is missing or it declares no file name
 */
private static final String getFilenameFromURL(URL url) throws IOException {
    HttpURLConnection con = (HttpURLConnection) url.openConnection();
    try {
        con.setRequestMethod("GET");
        String contentDisp= con.getHeaderField("Content-Disposition");
        // Header may be absent : fail with a meaningful error instead of a NullPointerException
        if(contentDisp==null)
            throw new IOException("No Content-Disposition header returned by "+url);
        Matcher m = FILENAME_IN_DEPOSITION_REGEXP.matcher(contentDisp);
        // group(0) is valid only after a successful find()
        if(!m.find())
            throw new IOException("No filename in Content-Disposition header ["+contentDisp+"] returned by "+url);
        return m.group(0);
    }finally {
        // Only headers are needed, release the connection
        con.disconnect();
    }
}
}

View File

@ -10,7 +10,6 @@ import org.gcube.common.resources.gcore.GenericResource;
import org.gcube.common.resources.gcore.Resources;
import org.gcube.data.publishing.ckan2zenodo.clients.Zenodo;
import org.gcube.data.publishing.ckan2zenodo.commons.IS;
import org.gcube.data.publishing.ckan2zenodo.commons.Net;
import org.gcube.data.publishing.ckan2zenodo.model.CkanItemDescriptor;
import org.gcube.data.publishing.ckan2zenodo.model.CkanRelatedIdentifier;
import org.gcube.data.publishing.ckan2zenodo.model.CkanResource;
@ -88,8 +87,8 @@ public class ZenodoTests {
CkanItemDescriptor desc=new CkanItemDescriptor(json);
for(CkanResource cRes:tran.filterResources(desc)) {
DownloadedFile f=Net.download(cRes);
FileDeposition file=z.uploadFile(dep, cRes.getName(), f.getF());
DownloadedFile f=new DownloadedFile(cRes);
FileDeposition file=z.uploadFile(dep, cRes.getName(), f.getFile());
System.out.println("Published "+file);
}
@ -116,9 +115,9 @@ public class ZenodoTests {
CkanItemDescriptor desc=new CkanItemDescriptor(json);
for(CkanResource cRes:tran.filterResources(desc)) {
DownloadedFile f=Net.download(cRes);
DownloadedFile f=new DownloadedFile(cRes);
FileDeposition file=z.uploadFile(dep, cRes.getName(), f.getF());
FileDeposition file=z.uploadFile(dep, cRes.getName(), f.getFile());
System.out.println("Published "+file);
}
dep=z.publish(dep);

View File

@ -1,6 +1,5 @@
package org.gcube.data.publishing.ckan2zenodo.model;
import org.gcube.data.publishing.ckan2zenodo.commons.Net;
import org.junit.Test;
import static junit.framework.TestCase.assertTrue;
@ -17,28 +16,28 @@ public class NetTests {
// PDF URL
res.setUrl("https://data-pre.d4science.net/RgA7");
check(Net.download(res),"Deliverable.pdf",true);
check(new DownloadedFile(res),"Deliverable.pdf",true);
// Do not use HEAD if extension in resource name
res.setName("Deliverable.rtf");
check(Net.download(res),res.getName(),true);
check(new DownloadedFile(res),res.getName(),true);
//Check invalid urls i.e. folder url == UNABLE TO GET FILENAME FROM HEAD
res.setUrl("http://data-pre.d4science.org/workspace-explorer-app?folderId=UjV1MTJ4K2lvQU5MRE1MT2NCOEVGWDkvMG5SL2dwY3A0QmpWZmdRVEFxR3Njd2cwcUxUQ3BBZzZxa1FhN3JQTQ");
// Still should use resource name
check(Net.download(res),res.getName(),true);
check(new DownloadedFile(res),res.getName(),true);
//Check invalid urls i.e. folder url == UNABLE TO GET FILENAME FROM HEAD
res.setName("Deliverable");
//Shouldn't have extension
check(Net.download(res),res.getName(),false);
check(new DownloadedFile(res),res.getName(),false);
}
private static final void check(DownloadedFile f, String expectedFilename, boolean expectExtension){
private static final void check(DownloadedFile f, String expectedFilename, boolean expectExtension) throws Exception {
System.out.println(f);
System.out.println("Resulting filename is : "+f.getToUseFileName());
assertTrue(f.getRemoteFileName()!=null);
assertTrue(f.getToUseFileName()!=null);
if(expectExtension)
assertTrue(DownloadedFile.getExtension(f.getToUseFileName())!=null);