Added regexp management

This commit is contained in:
Fabio Sinibaldi 2019-12-03 17:20:12 +01:00
parent 7a8e79eff8
commit 681f82e7c6
5 changed files with 119 additions and 25 deletions

View File

@ -3,17 +3,19 @@ package org.gcube.data.publishing.ckan2zenodo;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.gcube.data.publishing.ckan2zenodo.commons.Parsing;
import org.gcube.data.publishing.ckan2zenodo.model.CkanItemDescriptor;
import org.gcube.data.publishing.ckan2zenodo.model.Mapping;
import org.gcube.data.publishing.ckan2zenodo.model.Mapping.Regexp;
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.Contributor;
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.DepositionMetadata;
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.DepositionMetadata.AccessRights;
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.RelatedIdentifier;
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.ZenodoDeposition;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
@ -102,8 +104,32 @@ public class Transformer {
List<String> sourceValues=sourceCtx.read(mapping.getSource());
if(sourceValues!=null)
for(String sourceValue:sourceValues) {
	// Each source value goes through a two-step pipeline:
	// 1) the mapping's regexps, in declaration order, each working on the
	//    output of the previous one;
	// 2) the value mapping (code list), applied to the regexp result.
	String resultingValue=sourceValue;
	// apply regexps
	for(Regexp regexp:mapping.getRegexp()) {
		// A previous "extract" found no match: there is nothing left to
		// transform, and calling replaceAll on null would throw.
		if(resultingValue==null) break;
		switch(regexp.getType()) {
		case extract : {
			// Keep only the first substring matching the pattern.
			Pattern p=Pattern.compile(regexp.getTarget());
			Matcher m = p.matcher(resultingValue);
			resultingValue=m.find()?m.group():null;
			break;
		}
		case replace : {
			resultingValue=resultingValue.replaceAll(regexp.getTarget(), regexp.getReplacement());
			break;
		}
		}
	}
	// An "extract" without a match yields no value: skip it instead of
	// writing null (or the untouched source value) into the target.
	if(resultingValue==null) continue;
	// apply value mapping on the transformed value, so that the regexp
	// result is not thrown away; unmapped values pass through unchanged
	resultingValue=mapping.getValueMapping().getOrDefault(resultingValue, resultingValue);
	targetCtx.add(mapping.getTarget(),resultingValue);
}
}

View File

@ -10,6 +10,7 @@ import javax.xml.parsers.ParserConfigurationException;
import org.gcube.common.resources.gcore.GenericResource;
import org.gcube.data.publishing.ckan2zenodo.commons.IS;
import org.gcube.data.publishing.ckan2zenodo.model.Mapping;
import org.gcube.data.publishing.ckan2zenodo.model.Mapping.Regexp;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
@ -42,36 +43,52 @@ public class TransformerManager {
/**
 * Reads the mapping declarations contained in the body of the given generic resource.
 *
 * Every {@code <mapping>} element is expected to contain:
 * <ul>
 * <li>one {@code <source>} and one {@code <target>} element;</li>
 * <li>zero or more {@code <valueMapping>} elements, each with a
 * {@code <sourceValue>} and a {@code <targetValue>};</li>
 * <li>zero or more {@code <regexp type="...">} elements, each with a
 * {@code <target>} (the pattern) and, when the type is {@code replace},
 * a {@code <replacement>}.</li>
 * </ul>
 *
 * @param res the generic resource whose profile body declares the mappings
 * @return the parsed mappings, in document order (empty if none is declared)
 * @throws IllegalArgumentException if a regexp {@code type} attribute is not
 *         the name of a {@link Regexp.Type} constant
 */
private static ArrayList<Mapping> readMappings(GenericResource res){

	// ByteArrayInputStream input = new ByteArrayInputStream(
	// res.profile()..toString().getBytes("UTF-8"));
	// Document doc = builder.parse(input);
	// XPath xPath = XPathFactory.newInstance().newXPath();
	// String expression = "/class/student";
	// NodeList nodeList = (NodeList) xPath.compile(expression).evaluate(
	// doc, XPathConstants.NODESET);

	ArrayList<Mapping> toReturn=new ArrayList<Mapping>();
	Element root=res.profile().body();
	NodeList mappings=root.getElementsByTagName("mapping");
	for(int i = 0; i<mappings.getLength();i++) {
		Element mapping=(Element) mappings.item(i);
		String source=mapping.getElementsByTagName("source").item(0).getTextContent();
		// getElementsByTagName returns descendants in document order, so
		// item(0) is the mapping's own <target> as long as it is declared
		// before any <regexp> (which has a nested <target> of its own).
		String target=mapping.getElementsByTagName("target").item(0).getTextContent();

		HashMap<String,String> values=new HashMap<>();
		NodeList valueMappings=mapping.getElementsByTagName("valueMapping");
		// The inner loops must be bounded by j: testing i here either
		// skips the loop or runs past the end of the node list.
		for(int j = 0; j<valueMappings.getLength();j++) {
			Element codelistMapping=(Element) valueMappings.item(j);
			String sourceValue=codelistMapping.getElementsByTagName("sourceValue").item(0).getTextContent();
			String targetValue=codelistMapping.getElementsByTagName("targetValue").item(0).getTextContent();
			values.put(sourceValue, targetValue);
		}

		ArrayList<Regexp> regularExpressions=new ArrayList<>();
		NodeList regexpDeclarations=mapping.getElementsByTagName("regexp");
		for(int j = 0; j<regexpDeclarations.getLength();j++) {
			Element regexpElement=(Element) regexpDeclarations.item(j);
			String regexpTarget=regexpElement.getElementsByTagName("target").item(0).getTextContent();
			String typeName=regexpElement.getAttribute("type");
			Regexp regexp=new Regexp(Regexp.Type.valueOf(typeName),regexpTarget);
			// Only "replace" expressions carry a <replacement> element.
			if(regexp.getType().equals(Regexp.Type.replace))
				regexp.setReplacement(regexpElement.getElementsByTagName("replacement").item(0).getTextContent());
			regularExpressions.add(regexp);
		}

		toReturn.add(new Mapping(source,target,values,regularExpressions));
	}
	return toReturn;
}
}

View File

@ -1,6 +1,7 @@
package org.gcube.data.publishing.ckan2zenodo.model;
import java.util.HashMap;
import java.util.List;
import lombok.Getter;
import lombok.NonNull;
@ -9,16 +10,30 @@ import lombok.Setter;
/**
 * Declares how values found at a source path are copied to a target path,
 * optionally transformed by regular expressions and by a value mapping.
 *
 * All four fields are required constructor arguments, in declaration order:
 * source, target, valueMapping, regexp.
 */
@RequiredArgsConstructor
@Getter
@Setter
public class Mapping {

	/**
	 * A regular expression applied to a source value.
	 * Type and target are required constructor arguments; the replacement
	 * is optional and set afterwards.
	 */
	@RequiredArgsConstructor
	@Getter
	public static class Regexp{

		/**
		 * Kind of operation: {@code replace} substitutes every match of the
		 * target pattern with the replacement, {@code extract} keeps the
		 * first substring matching the target pattern.
		 */
		public static enum Type{
			replace,extract
		}

		@NonNull
		private Type type;
		/** The regular expression pattern. */
		@NonNull
		private String target;
		/** Replacement text; only meaningful for {@link Type#replace}. */
		@Setter
		private String replacement;
	}

	/** Path expression selecting the values to read. */
	@NonNull
	private String source;
	/** Path expression of the location the resulting values are added to. */
	@NonNull
	private String target;
	/** Code list translating a value into its target counterpart. */
	@NonNull
	private HashMap<String,String> valueMapping;
	/** Regular expressions to apply, in order; may be empty but not null. */
	@NonNull
	private List<Regexp> regexp;
}

View File

@ -2,12 +2,10 @@ package org.gcube.tests;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.gcube.data.publishing.ckan2zenodo.Fixer;
import org.gcube.data.publishing.ckan2zenodo.Transformer;
import org.gcube.data.publishing.ckan2zenodo.model.CkanItemDescriptor;
import org.gcube.data.publishing.ckan2zenodo.model.Mapping;
import org.gcube.data.publishing.ckan2zenodo.model.zenodo.ZenodoDeposition;
import org.junit.BeforeClass;
import org.junit.Test;

View File

@ -0,0 +1,38 @@
<Resource version="0.4.x">
	<ID>4adeaca2-8e32-4507-8937-d891629998e2</ID>
	<Type>GenericResource</Type>
	<Scopes></Scopes>
	<Profile>
		<SecondaryType>Ckan-Zenodo-Mappings</SecondaryType>
		<Name>ResearchObject</Name>
		<Description>Simple mappings tests</Description>
		<Body>
			<mappings>
				<!-- Contributor name: the "Surname, Name" pair that is followed by a comma. -->
				<mapping>
					<source>$.extras[?(@.key=='Author')].value</source>
					<target>$.metadata.contributors[0].name</target>
					<regexp type="extract">
						<target>([A-Za-z]*, [A-Za-z]*)(?=,)</target>
					</regexp>
				</mapping>
				<!-- Contributor type: constant value. The pattern is anchored because
				     an unanchored ".*" also matches the empty string at the end of
				     the value, so the replacement would be emitted twice. -->
				<mapping>
					<source>$.extras[?(@.key=='Author')].value</source>
					<target>$.metadata.contributors[0].type</target>
					<regexp type="replace">
						<target>^.*$</target>
						<replacement>Producer</replacement>
					</regexp>
				</mapping>
				<!-- Contributor ORCID: everything from "orcid.org/" onwards (literal dot). -->
				<mapping>
					<source>$.extras[?(@.key=='Author')].value</source>
					<target>$.metadata.contributors[0].orcid</target>
					<regexp type="extract">
						<target>orcid\.org/.*</target>
					</regexp>
				</mapping>
			</mappings>
		</Body>
	</Profile>
</Resource>