ckan2zenodo-library/src/main/java/org/gcube/data/publishing/ckan2zenodo/Translator.java

284 lines
8.7 KiB
Java
Raw Normal View History

2019-11-27 18:21:01 +01:00
package org.gcube.data.publishing.ckan2zenodo;
2020-06-18 12:21:43 +02:00
import java.time.Instant;
2019-12-02 18:02:11 +01:00
import java.util.ArrayList;
2020-01-23 18:03:28 +01:00
import java.util.Collection;
2019-12-10 17:06:52 +01:00
import java.util.Collections;
2020-06-18 12:21:43 +02:00
import java.util.Date;
2020-02-06 17:58:11 +01:00
import java.util.HashSet;
2019-12-03 12:50:36 +01:00
import java.util.List;
2019-12-10 17:06:52 +01:00
import java.util.Map;
2019-12-03 17:20:12 +01:00
import java.util.regex.Matcher;
import java.util.regex.Pattern;
2019-12-02 18:02:11 +01:00
2019-12-03 12:50:36 +01:00
import org.gcube.data.publishing.ckan2zenodo.commons.Parsing;
2019-11-27 18:21:01 +01:00
import org.gcube.data.publishing.ckan2zenodo.model.CkanItemDescriptor;
2019-12-10 17:06:52 +01:00
import org.gcube.data.publishing.ckan2zenodo.model.CkanResource;
2019-12-05 11:56:51 +01:00
import org.gcube.data.publishing.ckan2zenodo.model.faults.TransformationException;
2019-12-10 17:06:52 +01:00
import org.gcube.data.publishing.ckan2zenodo.model.parsing.Mapping;
import org.gcube.data.publishing.ckan2zenodo.model.parsing.Mapping.Regexp;
import org.gcube.data.publishing.ckan2zenodo.model.parsing.Mapping.Source.Value;
import org.gcube.data.publishing.ckan2zenodo.model.parsing.ResourceFilter;
2019-12-10 17:06:52 +01:00
import org.gcube.data.publishing.ckan2zenodo.model.parsing.ResourceFilter.Filter;
2019-12-02 18:02:11 +01:00
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.Contributor;
2020-06-18 12:21:43 +02:00
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.Creator;
2019-12-02 18:02:11 +01:00
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.DepositionMetadata;
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.DepositionMetadata.AccessRights;
2020-06-18 12:21:43 +02:00
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.DepositionMetadata.UploadType;
2019-12-03 12:50:36 +01:00
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.RelatedIdentifier;
2019-11-27 18:21:01 +01:00
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.ZenodoDeposition;
2019-12-03 12:50:36 +01:00
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
2019-12-02 18:02:11 +01:00
import lombok.extern.slf4j.Slf4j;
2019-12-10 17:06:52 +01:00
import net.minidev.json.JSONArray;
import net.minidev.json.JSONObject;
2019-12-02 18:02:11 +01:00
@Slf4j
2019-12-03 12:50:36 +01:00
@RequiredArgsConstructor
2019-12-10 17:06:52 +01:00
public class Translator {
2019-11-27 18:21:01 +01:00
2019-12-03 12:50:36 +01:00
/** Profile-specific mappings; applyMappings iterates them in list order on top of the default metadata set by transform. */
@NonNull
private List<Mapping> mappings;
2020-06-24 16:47:01 +02:00
2019-12-10 17:06:52 +01:00
/** Filter set used by filterResources to select the resources of a CKAN item. */
@NonNull
private ResourceFilter resourceFilter;
2019-12-05 11:56:51 +01:00
2019-12-10 17:06:52 +01:00
/**
 * Creates a translator with no profile-specific mappings and a resource
 * filter built from an empty filter list.
 */
public Translator() {
	// Collections.emptyList() is the type-safe form of the raw EMPTY_LIST constant.
	this(Collections.emptyList(),new ResourceFilter(Collections.emptyList()));
}
2020-06-24 16:47:01 +02:00
2019-12-10 17:06:52 +01:00
/**
 * Creates a translator that applies the given mappings and uses a resource
 * filter built from an empty filter list.
 *
 * @param mappings profile-specific mappings to apply after the default ones
 */
public Translator(List<Mapping> mappings) {
	// Collections.emptyList() is the type-safe form of the raw EMPTY_LIST constant.
	this(mappings,new ResourceFilter(Collections.emptyList()));
}
2019-12-05 11:56:51 +01:00
public ZenodoDeposition transform(CkanItemDescriptor toTransform, ZenodoDeposition deposition) throws TransformationException {
2019-12-02 18:02:11 +01:00
log.debug("Transforming "+toTransform+". Existing Deposition is : "+deposition);
2019-12-05 11:56:51 +01:00
2019-12-02 18:02:11 +01:00
if(deposition==null) deposition=new ZenodoDeposition();
2019-12-05 11:56:51 +01:00
2019-12-02 18:02:11 +01:00
// default mappings
DepositionMetadata meta=deposition.getMetadata();
if(meta==null)
meta=new DepositionMetadata();
2019-12-05 11:56:51 +01:00
2020-06-18 12:21:43 +02:00
// UPLOAD TYPE
meta.setUpload_type(UploadType.other);
2020-06-24 16:47:01 +02:00
2020-06-18 12:21:43 +02:00
// TITLE
2019-12-02 18:02:11 +01:00
meta.setTitle(toTransform.getTitle());
2020-06-18 12:21:43 +02:00
// DESCRIPTION
2019-12-02 18:02:11 +01:00
meta.setDescription(toTransform.getNotes());
2019-12-05 11:56:51 +01:00
2020-06-18 12:21:43 +02:00
// ACCESS RIGHTS
2019-12-02 18:02:11 +01:00
if(toTransform.isOpen())
meta.setAccess_right(AccessRights.open);
else {
meta.setAccess_right(AccessRights.restricted);
meta.setAccess_conditions("Visit the VRE "+toTransform.getVRE()+" to access it.");
}
2019-12-05 11:56:51 +01:00
2020-06-18 12:21:43 +02:00
// LICENSE
2019-12-02 18:02:11 +01:00
meta.setLicense(toTransform.getLicenseId());
2019-12-05 11:56:51 +01:00
2020-06-18 12:21:43 +02:00
// TAGS
2019-12-02 18:02:11 +01:00
meta.setKeywords(new ArrayList<String>(toTransform.getTags()));
2019-12-05 11:56:51 +01:00
2020-06-18 12:21:43 +02:00
//RELATED IDENTIFIER
2019-12-02 18:02:11 +01:00
String itemUrl=toTransform.getItemUrl();
ArrayList<RelatedIdentifier> relatedIdentifiers=new ArrayList<>();
2020-06-18 12:21:43 +02:00
if(itemUrl!=null) relatedIdentifiers.add(new RelatedIdentifier(itemUrl,RelatedIdentifier.Relation.isCompiledBy));
2019-12-05 11:56:51 +01:00
2019-12-03 12:50:36 +01:00
meta.setRelated_identifiers(relatedIdentifiers);
2019-12-05 11:56:51 +01:00
2020-06-24 16:47:01 +02:00
2020-06-18 12:21:43 +02:00
//CONTRIBUTORS & CREATORS
2019-12-02 18:02:11 +01:00
ArrayList<Contributor> contributors=new ArrayList<>();
2020-06-18 12:21:43 +02:00
ArrayList<Creator> creators=new ArrayList<>();
2020-06-24 16:47:01 +02:00
2019-12-02 18:02:11 +01:00
String authorName=toTransform.getAuthor();
if(authorName!=null) {
Contributor author=new Contributor(Contributor.Type.Producer);
author.setName(authorName);
contributors.add(author);
2020-06-24 16:47:01 +02:00
2020-06-18 12:21:43 +02:00
creators.add(new Creator(authorName));
2020-06-24 16:47:01 +02:00
2019-12-02 18:02:11 +01:00
}
2019-12-05 11:56:51 +01:00
2019-12-02 18:02:11 +01:00
String maintainerName=toTransform.getAuthor();
if(maintainerName!=null) {
Contributor maintainer=new Contributor(Contributor.Type.DataCurator);
maintainer.setName(maintainerName);
contributors.add(maintainer);
2020-06-24 16:47:01 +02:00
2020-06-18 12:21:43 +02:00
creators.add(new Creator(authorName));
2019-12-02 18:02:11 +01:00
}
2020-06-24 16:47:01 +02:00
2020-06-18 12:21:43 +02:00
// D4Science as contributor
Contributor d4Science=new Contributor(Contributor.Type.HostingInstitution);
d4Science.setName("D4Science");
2020-06-18 13:46:19 +02:00
contributors.add(d4Science);
2020-06-24 16:47:01 +02:00
2019-12-02 18:02:11 +01:00
meta.setContributors(contributors);
2020-06-18 12:21:43 +02:00
meta.setCreators(creators);
2020-06-24 16:47:01 +02:00
2020-06-18 12:21:43 +02:00
// VERSION
2019-12-02 18:02:11 +01:00
meta.setVersion(toTransform.getVersion());
2019-12-05 11:56:51 +01:00
2020-06-18 12:21:43 +02:00
// DATES
// PUBLICATION DATE = now
meta.setPublication_date(Date.from(Instant.now()));
2020-06-24 16:47:01 +02:00
2019-12-02 18:02:11 +01:00
deposition.setMetadata(meta);
2019-12-05 11:56:51 +01:00
2019-12-02 18:02:11 +01:00
// profile specific mappings
2019-12-03 12:50:36 +01:00
return applyMappings(toTransform, deposition);
}
2019-12-05 11:56:51 +01:00
private ZenodoDeposition applyMappings(CkanItemDescriptor source, ZenodoDeposition target) throws TransformationException {
try{
ObjectMapper mapper=Parsing.getMapper();
DocumentContext sourceCtx=JsonPath.using(Parsing.JSON_PATH_ALWAYS_LIST_CONFIG).parse(source.getContent());
DocumentContext targetCtx=JsonPath.using(Parsing.JSON_PATH_ALWAYS_LIST_CONFIG).parse(mapper.writeValueAsString(target));
for(Mapping mapping:mappings) {
try {
// extract source
List<String> sourceValues=new ArrayList<>();
2020-06-24 16:47:01 +02:00
for(Value v: mapping.getSource().getValues()) {
2020-06-24 16:47:01 +02:00
String actualValue=null;
switch(v.getType()) {
case constant : {
2020-06-24 16:47:01 +02:00
actualValue=v.getValue();
break;
}
case jsonPath : {
2020-01-23 18:03:28 +01:00
for(String s: ((Collection<? extends String>) sourceCtx.read(v.getValue()))){
if(s!=null) {
s=s.trim();
2020-06-24 16:47:01 +02:00
if(!s.isEmpty())actualValue=s;
2020-01-23 18:03:28 +01:00
}
2020-06-24 16:47:01 +02:00
2020-01-23 18:03:28 +01:00
}
break;
}
}
2020-06-24 16:47:01 +02:00
// Adding to actual values
if(actualValue!=null) {
if(v.getSplit()!=null)
for(String toAdd:actualValue.split(v.getSplit()))
sourceValues.add(toAdd.trim());
else sourceValues.add(actualValue);
}
if(!sourceValues.isEmpty()) break;
2020-06-24 16:47:01 +02:00
2019-12-05 11:56:51 +01:00
}
2020-06-24 16:47:01 +02:00
2019-12-05 11:56:51 +01:00
for(String sourceValue:sourceValues) {
String resultingValue=sourceValue;
// apply regexps
for(Regexp regexp:mapping.getRegexp()) {
switch(regexp.getType()) {
case extract : {
Pattern p=Pattern.compile(regexp.getTarget());
Matcher m = p.matcher(resultingValue);
if(m.find())
resultingValue=m.group();
else resultingValue=null;
break;
}
case replace : {
if(resultingValue!=null) {
String replacement=regexp.getReplacement()!=null?regexp.getReplacement():"";
resultingValue=resultingValue.replaceAll(regexp.getTarget(), replacement);
break;
}
}
}
2019-12-03 17:20:12 +01:00
}
2019-12-05 11:56:51 +01:00
// apply value mappings
resultingValue =mapping.getValueMapping().getOrDefault(sourceValue, resultingValue);
2020-06-24 16:47:01 +02:00
2019-12-10 17:06:52 +01:00
// check if targetPath exists
List<String> targetElementFound=targetCtx.read(mapping.getTargetPath());
if(targetElementFound==null || targetElementFound.size()==0 || targetElementFound.get(0)==null) {
2020-06-24 16:47:01 +02:00
// targetCtx=targetCtx.add(mapping.getTargetPath(),Collections.singletonList("nothing"));
targetCtx=Parsing.addElement(targetCtx, mapping.getTargetPath());
2020-06-24 18:06:56 +02:00
}
2020-06-24 16:47:01 +02:00
2020-06-24 18:06:56 +02:00
if(mapping.getTargetElement().getAppend()){
String original=((List<String>)targetCtx.read(mapping.getTargetPath()+"."+mapping.getTargetElement().getTargetElement())).get(0);
if(original!=null && !original.isEmpty())
resultingValue=original+resultingValue;
}
targetCtx=targetCtx.put(mapping.getTargetPath(),mapping.getTargetElement().getTargetElement(),
resultingValue);
2019-12-03 17:20:12 +01:00
}
2019-12-05 11:56:51 +01:00
}catch(Throwable t) {
throw new TransformationException("Exception while applying "+mapping,t);
2019-12-03 12:50:36 +01:00
}
2019-12-05 11:56:51 +01:00
}
2020-01-13 15:23:36 +01:00
String serializedOutput=targetCtx.jsonString();
log.debug("Mapping complete. Going to return : "+serializedOutput);
return mapper.readValue(serializedOutput, ZenodoDeposition.class);
2019-12-05 11:56:51 +01:00
}catch(Throwable t) {
2019-12-10 17:06:52 +01:00
log.error("Unable to translate "+source+" using previous "+target,t);
2019-12-05 11:56:51 +01:00
throw new TransformationException("Unable to translate "+source.getName(),t);
2019-12-03 12:50:36 +01:00
}
2019-11-27 18:21:01 +01:00
}
2019-12-05 11:56:51 +01:00
2020-06-24 16:47:01 +02:00
2019-12-10 17:06:52 +01:00
public List<CkanResource> filterResources(CkanItemDescriptor source) throws TransformationException{
try {
2020-06-24 16:47:01 +02:00
ObjectMapper mapper=Parsing.getMapper();
DocumentContext sourceCtx=JsonPath.using(Parsing.JSON_PATH_ALWAYS_LIST_CONFIG).parse(source.getContent());
HashSet<CkanResource> toReturn=new HashSet();
for(Filter f:resourceFilter.getFilters()) {
JSONArray filtered=sourceCtx.read(f.getConditions().get(0));
for(Object obj:filtered) {
Map<String,String> map=(Map<String, String>) obj;
toReturn.add(mapper.readValue((new JSONObject(map)).toJSONString(), CkanResource.class));
}
}
return new ArrayList<CkanResource>(toReturn);
2019-12-10 17:06:52 +01:00
}catch(Throwable t) {
log.error("Unable to filter resources. ",t);
throw new TransformationException("Unable to filter "+source.getName()+" resources",t);
}
}
2019-11-27 18:21:01 +01:00
}