synch fork with master

Miriam Baglioni 2020-02-17 18:22:27 +01:00
commit 5868ff8a86
97 changed files with 7166 additions and 731 deletions

.gitignore vendored (4 changed lines)
View File

@ -1,6 +1,8 @@
.DS_Store
.idea
*.iml
*.ipr
*.iws
*~
.classpath
/*/.classpath
@ -18,5 +20,5 @@
/*/build
/build
spark-warehouse
/*/*/job-override.properties
/**/job-override.properties

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>
@ -76,6 +76,41 @@
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<!--This plugin's configuration is used to store Eclipse m2e settings only. It has no influence on the Maven build itself.-->
<plugin>
<groupId>org.eclipse.m2e</groupId>
<artifactId>lifecycle-mapping</artifactId>
<version>1.0.0</version>
<configuration>
<lifecycleMappingMetadata>
<pluginExecutions>
<pluginExecution>
<pluginExecutionFilter>
<groupId>
org.apache.maven.plugins
</groupId>
<artifactId>
maven-plugin-plugin
</artifactId>
<versionRange>
[3.2,)
</versionRange>
<goals>
<goal>descriptor</goal>
</goals>
</pluginExecutionFilter>
<action>
<ignore></ignore>
</action>
</pluginExecution>
</pluginExecutions>
</lifecycleMappingMetadata>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

View File

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
</parent>
<artifactId>dhp-build</artifactId>
<packaging>pom</packaging>

View File

@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>
@ -42,6 +42,10 @@
<groupId>com.rabbitmq</groupId>
<artifactId>amqp-client</artifactId>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,32 @@
package eu.dnetlib.dhp.utils.saxon;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.lib.ExtensionFunctionCall;
import net.sf.saxon.lib.ExtensionFunctionDefinition;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.om.StructuredQName;
import net.sf.saxon.trans.XPathException;
public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition {
public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension";
public abstract String getName();
public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException;
@Override
public StructuredQName getFunctionQName() {
return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName());
}
@Override
public ExtensionFunctionCall makeCallExpression() {
return new ExtensionFunctionCall() {
@Override
public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException {
return doCall(context, arguments);
}
};
}
}

View File

@ -0,0 +1,67 @@
package eu.dnetlib.dhp.utils.saxon;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.om.Item;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.value.SequenceType;
import net.sf.saxon.value.StringValue;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.GregorianCalendar;
public class ExtractYear extends AbstractExtensionFunction {
private static final String[] dateFormats = { "yyyy-MM-dd", "yyyy/MM/dd" };
@Override
public String getName() {
return "extractYear";
}
@Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null || arguments.length == 0) {
return new StringValue("");
}
final Item item = arguments[0].head();
if (item == null) {
return new StringValue("");
}
return new StringValue(_year(item.getStringValue()));
}
@Override
public int getMinimumNumberOfArguments() {
return 0;
}
@Override
public int getMaximumNumberOfArguments() {
return 1;
}
@Override
public SequenceType[] getArgumentTypes() {
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
}
@Override
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
return SequenceType.SINGLE_STRING;
}
private String _year(String s) {
Calendar c = new GregorianCalendar();
for (String format : dateFormats) {
try {
c.setTime(new SimpleDateFormat(format).parse(s));
String year = String.valueOf(c.get(Calendar.YEAR));
return year;
} catch (ParseException e) {}
}
return "";
}
}

View File

@ -0,0 +1,66 @@
package eu.dnetlib.dhp.utils.saxon;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.value.SequenceType;
import net.sf.saxon.value.StringValue;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
public class NormalizeDate extends AbstractExtensionFunction {
private static final String[] normalizeDateFormats = { "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" };
private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
@Override
public String getName() {
return "normalizeDate";
}
@Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null || arguments.length == 0) {
return new StringValue("");
}
String s = arguments[0].head().getStringValue();
return new StringValue(normalize(s));
}
@Override
public int getMinimumNumberOfArguments() {
return 0;
}
@Override
public int getMaximumNumberOfArguments() {
return 1;
}
@Override
public SequenceType[] getArgumentTypes() {
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
}
@Override
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
return SequenceType.SINGLE_STRING;
}
private String normalize(String s) {
final String date = s != null ? s.trim() : "";
for (String format : normalizeDateFormats) {
try {
Date parse = new SimpleDateFormat(format).parse(date);
String res = new SimpleDateFormat(normalizeOutFormat).format(parse);
return res;
} catch (ParseException e) {}
}
return "";
}
}

View File

@ -0,0 +1,60 @@
package eu.dnetlib.dhp.utils.saxon;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.om.Item;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.trans.XPathException;
import net.sf.saxon.value.SequenceType;
import net.sf.saxon.value.StringValue;
import org.apache.commons.lang3.StringUtils;
public class PickFirst extends AbstractExtensionFunction {
@Override
public String getName() {
return "pickFirst";
}
@Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null || arguments.length == 0) {
return new StringValue("");
}
final String s1 = getValue(arguments[0]);
final String s2 = arguments.length > 1 ? getValue(arguments[1]) : "";
return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : "");
}
private String getValue(final Sequence arg) throws XPathException {
if (arg != null) {
final Item item = arg.head();
if (item != null) {
return item.getStringValue();
}
}
return "";
}
@Override
public int getMinimumNumberOfArguments() {
return 0;
}
@Override
public int getMaximumNumberOfArguments() {
return 2;
}
@Override
public SequenceType[] getArgumentTypes() {
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
}
@Override
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
return SequenceType.SINGLE_STRING;
}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.utils.saxon;
import net.sf.saxon.Configuration;
import net.sf.saxon.TransformerFactoryImpl;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.stream.StreamSource;
import java.io.StringReader;
public class SaxonTransformerFactory {
/**
* Creates the index record transformer from the given XSLT
* @param xslt the XSLT stylesheet, passed as a String
* @return a Transformer with the D-Net Saxon extension functions (extractYear, normalizeDate, pickFirst) registered
* @throws TransformerException if the stylesheet cannot be compiled
*/
public static Transformer newInstance(final String xslt) throws TransformerException {
final TransformerFactoryImpl factory = new TransformerFactoryImpl();
final Configuration conf = factory.getConfiguration();
conf.registerExtensionFunction(new ExtractYear());
conf.registerExtensionFunction(new NormalizeDate());
conf.registerExtensionFunction(new PickFirst());
return factory.newTransformer(new StreamSource(new StringReader(xslt)));
}
}
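
As a usage illustration, a minimal sketch of how a caller might exercise the factory above: the XSLT declares the dnet prefix bound to DEFAULT_SAXON_EXT_NS_URI and invokes the registered extractYear extension function. The sample class, record and stylesheet are hypothetical and not part of this changeset.

// Hypothetical usage sketch, not part of this changeset.
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import java.io.StringReader;
import java.io.StringWriter;
public class SaxonTransformerFactoryExample {
    public static void main(final String[] args) throws Exception {
        final String xslt =
            "<xsl:stylesheet version='2.0' xmlns:xsl='http://www.w3.org/1999/XSL/Transform'" +
            " xmlns:dnet='http://www.d-net.research-infrastructures.eu/saxon-extension'>" +
            " <xsl:output method='text'/>" +
            " <xsl:template match='/record'>" +
            "  <xsl:value-of select='dnet:extractYear(date)'/>" +
            " </xsl:template>" +
            "</xsl:stylesheet>";
        // newInstance registers ExtractYear, NormalizeDate and PickFirst on the Saxon Configuration
        final Transformer transformer = SaxonTransformerFactory.newInstance(xslt);
        final StringWriter out = new StringWriter();
        transformer.transform(
            new StreamSource(new StringReader("<record><date>2020-02-17</date></record>")),
            new StreamResult(out));
        System.out.println(out); // prints: 2020
    }
}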

View File

@ -1,3 +1,11 @@
Description of the project
--------------------------
This project defines **serialization schemas** of Avro data store files that are used to pass data between workflow nodes in the system.
This project defines **object schemas** of the OpenAIRE main entities and the relationships among them.
Namely, it defines the model for
- **research product (result)**, which is subclassed into publication, dataset, other research product and software
- **data source**, an object describing the data provider (institutional repositories, aggregators, CRIS systems)
- **organization**, a research body managing a data source or participating in a research project
- **project**, a research project
The serializations of such objects (data store files) are used to pass data between workflow nodes in the processing pipeline.
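
For illustration, a minimal sketch of the serialization mentioned above, assuming the model beans are written as JSON with jackson-databind (added as a dependency of this module in this changeset); the class name and identifier value are purely illustrative.

// Hypothetical sketch, not part of this changeset: serialize one of the model beans as JSON.
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Publication;
public class SchemaSerializationExample {
    public static void main(final String[] args) throws Exception {
        final Publication p = new Publication();
        p.setId("50|example_____::0123456789abcdef"); // illustrative identifier only
        final String json = new ObjectMapper().writeValueAsString(p);
        System.out.println(json);
    }
}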

View File

@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>
@ -26,6 +26,11 @@
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>

View File

@ -1,118 +0,0 @@
package eu.dnetlib.dhp.schema.dli;
import java.io.Serializable;
import java.util.List;
public class Entity implements Serializable {
private String identifier;
private List<Pid> pid;
private List<String> title;
private List<String> date;
private String typology;
private List<String> authors;
private List<Subject> subject;
private String description;
private String completionStatus;
private List<Provenance> collectedFrom;
private List<String> publisher;
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
public List<Pid> getPid() {
return pid;
}
public void setPid(List<Pid> pid) {
this.pid = pid;
}
public List<String> getTitle() {
return title;
}
public void setTitle(List<String> title) {
this.title = title;
}
public List<String> getDate() {
return date;
}
public void setDate(List<String> date) {
this.date = date;
}
public String getTypology() {
return typology;
}
public void setTypology(String typology) {
this.typology = typology;
}
public List<String> getAuthors() {
return authors;
}
public void setAuthors(List<String> authors) {
this.authors = authors;
}
public List<Subject> getSubject() {
return subject;
}
public void setSubject(List<Subject> subject) {
this.subject = subject;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public List<Provenance> getCollectedFrom() {
return collectedFrom;
}
public void setCollectedFrom(List<Provenance> collectedFrom) {
this.collectedFrom = collectedFrom;
}
public List<String> getPublisher() {
return publisher;
}
public void setPublisher(List<String> publisher) {
this.publisher = publisher;
}
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
}

View File

@ -1,33 +0,0 @@
package eu.dnetlib.dhp.schema.dli;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.lang3.StringUtils;
public class Pid {
private String pid;
private String pidType;
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public String getPidType() {
return pidType;
}
public void setPidType(String pidType) {
this.pidType = pidType;
}
public String generateId() {
if(StringUtils.isEmpty(pid) || StringUtils.isEmpty(pidType))
return null;
return DHPUtils.md5(String.format("%s::%s", pid, pidType));
}
}

View File

@ -1,35 +0,0 @@
package eu.dnetlib.dhp.schema.dli;
public class Provenance {
private String datasourceId;
private String datasourceName;
private String completionStatus;
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(String datasourceId) {
this.datasourceId = datasourceId;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(String datasourceName) {
this.datasourceName = datasourceName;
}
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
}

View File

@ -1,47 +0,0 @@
package eu.dnetlib.dhp.schema.dli;
import java.io.Serializable;
import java.util.List;
public class Relation implements Serializable {
private String source;
private String target;
private List<Provenance> provenance;
private RelationSemantic semantic;
public String getSource() {
return source;
}
public void setSource(String source) {
this.source = source;
}
public String getTarget() {
return target;
}
public void setTarget(String target) {
this.target = target;
}
public List<Provenance> getProvenance() {
return provenance;
}
public void setProvenance(List<Provenance> provenance) {
this.provenance = provenance;
}
public RelationSemantic getSemantic() {
return semantic;
}
public void setSemantic(RelationSemantic semantic) {
this.semantic = semantic;
}
}

View File

@ -1,16 +0,0 @@
package eu.dnetlib.dhp.schema.dli;
import java.io.Serializable;
public class RelationSemantic extends Subject implements Serializable {
public String inverse;
public String getInverse() {
return inverse;
}
public void setInverse(String inverse) {
this.inverse = inverse;
}
}

View File

@ -1,35 +0,0 @@
package eu.dnetlib.dhp.schema.dli;
import java.io.Serializable;
public class Subject implements Serializable {
private String schema;
private String value;
public Subject() {
}
public Subject(String schema, String value) {
this.schema = schema;
this.value = value;
}
public String getSchema() {
return schema;
}
public void setSchema(String schema) {
this.schema = schema;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

View File

@ -0,0 +1,15 @@
package eu.dnetlib.dhp.schema.oaf;
public class Country extends Qualifier {
private DataInfo dataInfo;
public DataInfo getDataInfo() {
return dataInfo;
}
public void setDataInfo(DataInfo dataInfo) {
this.dataInfo = dataInfo;
}
}

View File

@ -1,5 +1,6 @@
package eu.dnetlib.dhp.schema.oaf;
import com.fasterxml.jackson.annotation.JsonIgnore;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
@ -36,7 +37,7 @@ public class GeoLocation implements Serializable {
this.place = place;
}
@JsonIgnore
public boolean isBlank() {
return StringUtils.isBlank(point) &&
StringUtils.isBlank(box) &&

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
import java.util.List;
public class Instance implements Serializable {
@ -12,7 +13,7 @@ public class Instance implements Serializable {
private KeyValue hostedby;
private String url;
private List<String> url;
// other research products specific
private String distributionlocation;
@ -21,6 +22,14 @@ public class Instance implements Serializable {
private Field<String> dateofacceptance;
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
private Field<String> processingchargeamount;
// currency - alphabetic code described in ISO 4217. Defined here to cope with possible wrongly typed results
private Field<String> processingchargecurrency;
private Field<String> refereed; //peer-review status
public Field<String> getLicense() {
return license;
}
@ -53,11 +62,11 @@ public class Instance implements Serializable {
this.hostedby = hostedby;
}
public String getUrl() {
public List<String> getUrl() {
return url;
}
public void setUrl(String url) {
public void setUrl(List<String> url) {
this.url = url;
}
@ -85,7 +94,29 @@ public class Instance implements Serializable {
this.dateofacceptance = dateofacceptance;
}
public Field<String> getProcessingchargeamount() {
return processingchargeamount;
}
public void setProcessingchargeamount(Field<String> processingchargeamount) {
this.processingchargeamount = processingchargeamount;
}
public Field<String> getProcessingchargecurrency() {
return processingchargecurrency;
}
public void setProcessingchargecurrency(Field<String> processingchargecurrency) {
this.processingchargecurrency = processingchargecurrency;
}
public Field<String> getRefereed() {
return refereed;
}
public void setRefereed(Field<String> refereed) {
this.refereed = refereed;
}
public String toComparableString(){
return String.format("%s::%s::%s::%s",

View File

@ -1,5 +1,6 @@
package eu.dnetlib.dhp.schema.oaf;
import com.fasterxml.jackson.annotation.JsonIgnore;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
@ -40,6 +41,7 @@ public class KeyValue implements Serializable {
return isBlank()?"":String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : "");
}
@JsonIgnore
public boolean isBlank() {
return StringUtils.isBlank(key) && StringUtils.isBlank(value);
}

View File

@ -1,5 +1,6 @@
package eu.dnetlib.dhp.schema.oaf;
import com.fasterxml.jackson.annotation.JsonIgnore;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
@ -50,6 +51,8 @@ public class Qualifier implements Serializable {
schemeid != null ? schemeid : "",
schemename != null ? schemename : "");
}
@JsonIgnore
public boolean isBlank() {
return StringUtils.isBlank(classid) &&
StringUtils.isBlank(classname) &&

View File

@ -1,10 +1,8 @@
package eu.dnetlib.dhp.schema.oaf;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import java.util.Comparator;
import java.util.List;
public abstract class Result extends OafEntity implements Serializable {
@ -16,7 +14,7 @@ public abstract class Result extends OafEntity implements Serializable {
// common fields
private Qualifier language;
private List<Qualifier> country;
private List<Country> country;
private List<StructuredProperty> subject;
@ -44,16 +42,10 @@ public abstract class Result extends OafEntity implements Serializable {
private List<Field<String>> coverage;
private Field<String> refereed; //peer-review status
private Qualifier bestaccessright;
private List<Context> context;
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
private Field<String> processingchargeamount;
// currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly typed results
private Field<String> processingchargecurrency;
private List<ExternalReference> externalReference;
private List<Instance> instance;
@ -82,11 +74,11 @@ public abstract class Result extends OafEntity implements Serializable {
this.language = language;
}
public List<Qualifier> getCountry() {
public List<Country> getCountry() {
return country;
}
public void setCountry(List<Qualifier> country) {
public void setCountry(List<Country> country) {
this.country = country;
}
@ -194,12 +186,12 @@ public abstract class Result extends OafEntity implements Serializable {
this.coverage = coverage;
}
public Field<String> getRefereed() {
return refereed;
public Qualifier getBestaccessright() {
return bestaccessright;
}
public void setRefereed(Field<String> refereed) {
this.refereed = refereed;
public void setBestaccessright(Qualifier bestaccessright) {
this.bestaccessright = bestaccessright;
}
public List<Context> getContext() {
@ -226,24 +218,6 @@ public abstract class Result extends OafEntity implements Serializable {
this.instance = instance;
}
public Field<String> getProcessingchargeamount() {
return processingchargeamount;
}
public Result setProcessingchargeamount(Field<String> processingchargeamount) {
this.processingchargeamount = processingchargeamount;
return this;
}
public Field<String> getProcessingchargecurrency() {
return processingchargecurrency;
}
public Result setProcessingchargecurrency(Field<String> processingchargecurrency) {
this.processingchargecurrency = processingchargecurrency;
return this;
}
@Override
public void mergeFrom(OafEntity e) {
super.mergeFrom(e);
@ -287,19 +261,9 @@ public abstract class Result extends OafEntity implements Serializable {
coverage = mergeLists(coverage, r.getCoverage());
if (r.getRefereed() != null && compareTrust(this, r) < 0)
refereed = r.getRefereed();
context = mergeLists(context, r.getContext());
if (r.getProcessingchargeamount() != null && compareTrust(this, r) < 0)
processingchargeamount = r.getProcessingchargeamount();
if (r.getProcessingchargecurrency() != null && compareTrust(this, r) < 0)
processingchargecurrency = r.getProcessingchargecurrency();
externalReference = mergeLists(externalReference, r.getExternalReference());
}
@ -314,5 +278,4 @@ public abstract class Result extends OafEntity implements Serializable {
return a.size() > b.size() ? a : b;
}
}

View File

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>
@ -26,6 +26,12 @@
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
@ -45,6 +51,17 @@
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
</dependency>
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>42.2.10</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>

View File

@ -0,0 +1,236 @@
package eu.dnetlib.dhp.migration;
import java.io.Closeable;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.codehaus.jackson.map.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
public class AbstractMigrationExecutor implements Closeable {
private final AtomicInteger counter = new AtomicInteger(0);
private final Text key = new Text();
private final Text value = new Text();
private final ObjectMapper objectMapper = new ObjectMapper();
private final SequenceFile.Writer writer;
private static final Log log = LogFactory.getLog(AbstractMigrationExecutor.class);
public AbstractMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception {
log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s, nameNode=%s, user=%s", hdfsPath, hdfsNameNode, hdfsUser));
this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class));
}
private Configuration getConf(final String hdfsNameNode, final String hdfsUser) throws IOException {
final Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
System.setProperty("HADOOP_USER_NAME", hdfsUser);
System.setProperty("hadoop.home.dir", "/");
FileSystem.get(URI.create(hdfsNameNode), conf);
return conf;
}
protected void emitOaf(final Oaf oaf) {
try {
key.set(counter.getAndIncrement() + ":" + oaf.getClass().getSimpleName().toLowerCase());
value.set(objectMapper.writeValueAsString(oaf));
writer.append(key, value);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
@Override
public void close() throws IOException {
writer.hflush();
writer.close();
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
kv.setValue(v);
return kv;
}
public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); }
final List<KeyValue> list = new ArrayList<>();
for (int i = 0; i < s.length; i += 2) {
list.add(keyValue(s[i], s[i + 1]));
}
return list;
}
public static <T> Field<T> field(final T value, final DataInfo info) {
if (value == null || StringUtils.isBlank(value.toString())) { return null; }
final Field<T> field = new Field<>();
field.setValue(value);
field.setDataInfo(info);
return field;
}
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
return Arrays.stream(values).map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
}
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
return values.stream().map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
}
public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) {
final Qualifier q = new Qualifier();
q.setClassid(classid);
q.setClassname(classname);
q.setSchemeid(schemeid);
q.setSchemename(schemename);
return q;
}
public static StructuredProperty structuredProperty(final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(final String value, final Qualifier qualifier, final DataInfo dataInfo) {
if (value == null) { return null; }
final StructuredProperty sp = new StructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
sp.setDataInfo(dataInfo);
return sp;
}
public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) {
final ExtraInfo info = new ExtraInfo();
info.setName(name);
info.setValue(value);
info.setTypology(typology);
info.setProvenance(provenance);
info.setTrust(trust);
return info;
}
public static OAIProvenance oaiIProvenance(final String identifier,
final String baseURL,
final String metadataNamespace,
final Boolean altered,
final String datestamp,
final String harvestDate) {
final OriginDescription desc = new OriginDescription();
desc.setIdentifier(identifier);
desc.setBaseURL(baseURL);
desc.setMetadataNamespace(metadataNamespace);
desc.setAltered(altered);
desc.setDatestamp(datestamp);
desc.setHarvestDate(harvestDate);
final OAIProvenance p = new OAIProvenance();
p.setOriginDescription(desc);
return p;
}
public static Journal journal(final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final String ep,
final String iss,
final String sp,
final String vol,
final String edition,
final String conferenceplace,
final String conferencedate,
final DataInfo dataInfo) {
if (StringUtils.isNotBlank(name) || StringUtils.isNotBlank(issnPrinted) || StringUtils.isNotBlank(issnOnline) || StringUtils.isNotBlank(issnLinking)) {
final Journal j = new Journal();
j.setName(name);
j.setIssnPrinted(issnPrinted);
j.setIssnOnline(issnOnline);
j.setIssnLinking(issnLinking);
j.setEp(ep);
j.setIss(iss);
j.setSp(sp);
j.setVol(vol);
j.setEdition(edition);
j.setConferenceplace(conferenceplace);
j.setConferencedate(conferencedate);
j.setDataInfo(dataInfo);
return j;
} else {
return null;
}
}
public static DataInfo dataInfo(final Boolean deletedbyinference,
final String inferenceprovenance,
final Boolean inferred,
final Boolean invisible,
final Qualifier provenanceaction,
final String trust) {
final DataInfo d = new DataInfo();
d.setDeletedbyinference(deletedbyinference);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setInvisible(invisible);
d.setProvenanceaction(provenanceaction);
d.setTrust(trust);
return d;
}
public static String createOpenaireId(final int prefix, final String originalId) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
}
public static String asString(final Object o) {
return o == null ? "" : o.toString();
}
}
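
A short sketch of how the static helpers above compose the building blocks of an Oaf entity; class name and literal values are illustrative only (the vocabulary terms are taken from elsewhere in this changeset).

// Hypothetical sketch, not part of this changeset; all literal values are illustrative.
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import static eu.dnetlib.dhp.migration.AbstractMigrationExecutor.*;
public class MigrationHelpersExample {
    public static void main(final String[] args) {
        final DataInfo info = dataInfo(false, null, false, false,
            qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry",
                "dnet:provenance_actions", "dnet:provenance_actions"), "0.9");
        final KeyValue collectedFrom = keyValue("10|openaire____::0123", "Example Registry");
        final StructuredProperty pid =
            structuredProperty("10.1234/example", "doi", "doi", "dnet:pid_types", "dnet:pid_types", info);
        // createOpenaireId keeps the namespace prefix and hashes the part after "::"
        System.out.println(createOpenaireId(50, "repo_prefix::local-record-id"));
    }
}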

View File

@ -0,0 +1,439 @@
package eu.dnetlib.dhp.migration;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentFactory;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
protected final Map<String, String> code2name = new HashMap<>();
protected final MdstoreClient mdstoreClient;
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
private static final Log log = LogFactory.getLog(AbstractMongoExecutor.class);
public AbstractMongoExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl,
final String mongoDb, final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser);
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
loadClassNames(dbUrl, dbUser, dbPassword);
final Map<String, String> nsContext = new HashMap<>();
registerNamespaces(nsContext);
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
}
private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
log.info("Loading vocabulary terms from db...");
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
code2name.clear();
dbClient.processResults("select code, name from class", rs -> {
try {
code2name.put(rs.getString("code"), rs.getString("name"));
} catch (final SQLException e) {
e.printStackTrace();
}
});
}
log.info("Found " + code2name.size() + " terms.");
}
public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException {
log.info(String.format("Searching mdstores (format: %s, layout: %s, interpretation: %s)", mdFormat, mdLayout, mdInterpretation));
final Map<String, String> colls = mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation);
log.info("Found " + colls.size() + " mdstores");
for (final Entry<String, String> entry : colls.entrySet()) {
log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")");
final String currentColl = entry.getValue();
for (final String xml : mdstoreClient.listRecords(currentColl)) {
final Document doc = DocumentHelper.parseText(xml);
final String type = doc.valueOf("//dr:CobjCategory/@type");
final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom
: keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name"));
final DataInfo info = prepareDataInfo(doc);
final long lastUpdateTimestamp = new Date().getTime();
for (final Oaf oaf : createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp)) {
emitOaf(oaf);
}
}
}
log.info("All Done.");
}
protected void registerNamespaces(final Map<String, String> nsContext) {
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
}
protected List<Oaf> createOafs(final Document doc,
final String type,
final KeyValue collectedFrom,
final KeyValue hostedBy,
final DataInfo info,
final long lastUpdateTimestamp) {
final List<Oaf> oafs = new ArrayList<>();
switch (type.toLowerCase()) {
case "":
case "publication":
final Publication p = new Publication();
populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER);
p.setJournal(prepareJournal(doc, info));
oafs.add(p);
break;
case "dataset":
final Dataset d = new Dataset();
populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
d.setResulttype(DATASET_RESULTTYPE_QUALIFIER);
d.setStoragedate(prepareDatasetStorageDate(doc, info));
d.setDevice(prepareDatasetDevice(doc, info));
d.setSize(prepareDatasetSize(doc, info));
d.setVersion(prepareDatasetVersion(doc, info));
d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
d.setGeolocation(prepareDatasetGeoLocations(doc, info));
oafs.add(d);
break;
case "software":
final Software s = new Software();
populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER);
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
s.setLicense(prepareSoftwareLicenses(doc, info));
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
oafs.add(s);
break;
case "otherresearchproducts":
default:
final OtherResearchProduct o = new OtherResearchProduct();
populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
o.setResulttype(OTHER_RESULTTYPE_QUALIFIER);
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
o.setTool(prepareOtherResearchProductTools(doc, info));
oafs.add(o);
break;
}
if (!oafs.isEmpty()) {
oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
}
return oafs;
}
private List<Oaf> addProjectRels(final Document doc,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp) {
final List<Oaf> res = new ArrayList<>();
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
for (final Object o : doc.selectNodes("//oaf:projectid")) {
final String projectId = createOpenaireId(40, ((Node) o).getText());
final Relation r1 = new Relation();
r1.setRelType("resultProject");
r1.setSubRelType("outcome");
r1.setRelClass("isProducedBy");
r1.setSource(docId);
r1.setTarget(projectId);
r1.setCollectedFrom(Arrays.asList(collectedFrom));
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r1);
final Relation r2 = new Relation();
r2.setRelType("resultProject");
r2.setSubRelType("outcome");
r2.setRelClass("produces");
r2.setSource(projectId);
r2.setTarget(docId);
r2.setCollectedFrom(Arrays.asList(collectedFrom));
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r2);
}
return res;
}
protected abstract List<Oaf> addOtherResultRels(final Document doc,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp);
private void populateResultFields(final Result r,
final Document doc,
final KeyValue collectedFrom,
final KeyValue hostedBy,
final DataInfo info,
final long lastUpdateTimestamp) {
r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier")));
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
r.setCollectedfrom(Arrays.asList(collectedFrom));
r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setOaiprovenance(prepareOAIprovenance(doc));
r.setAuthor(prepareAuthors(doc, info));
r.setLanguage(prepareLanguages(doc));
r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setSubject(prepareSubjects(doc, info));
r.setTitle(prepareTitles(doc, info));
r.setRelevantdate(prepareRelevantDates(doc, info));
r.setDescription(prepareDescriptions(doc, info));
r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
r.setPublisher(preparePublisher(doc, info));
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
r.setSource(prepareSources(doc, info));
r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setFormat(prepareFormats(doc, info));
r.setContributor(prepareContributors(doc, info));
r.setResourcetype(prepareResourceType(doc, info));
r.setCoverage(prepareCoverages(doc, info));
r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
}
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
protected abstract List<Instance> prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareRelevantDates(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareCoverages(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareContributors(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareFormats(Document doc, DataInfo info);
protected abstract Field<String> preparePublisher(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareDescriptions(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
protected abstract Qualifier prepareLanguages(Document doc);
protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductTools(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(Document doc, DataInfo info);
protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);
protected abstract Field<String> prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info);
protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(Document doc, DataInfo info);
protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetMetadataVersionNumber(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetLastMetadataUpdate(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetVersion(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetSize(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetDevice(Document doc, DataInfo info);
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
private Journal prepareJournal(final Document doc, final DataInfo info) {
final Node n = doc.selectSingleNode("//oaf:journal");
if (n != null) {
final String name = n.getText();
final String issnPrinted = n.valueOf("@issn");
final String issnOnline = n.valueOf("@eissn");
final String issnLinking = n.valueOf("@lissn");
final String ep = n.valueOf("@ep");
final String iss = n.valueOf("@iss");
final String sp = n.valueOf("@sp");
final String vol = n.valueOf("@vol");
final String edition = n.valueOf("@edition");
if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); }
}
return null;
}
protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId, final String schemeName) {
final String classId = node.valueOf(xpath);
final String className = code2name.get(classId);
return qualifier(classId, className, schemeId, schemeName);
}
protected List<StructuredProperty> prepareListStructProps(final Node node,
final String xpath,
final String xpathClassId,
final String schemeId,
final String schemeName,
final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
final String classId = n.valueOf(xpathClassId);
final String className = code2name.get(classId);
res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
}
return res;
}
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
res.add(structuredProperty(n.getText(), qualifier, info));
}
return res;
}
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n
.valueOf("@schemename"), info));
}
return res;
}
protected OAIProvenance prepareOAIprovenance(final Document doc) {
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
if (n == null) { return null; }
final String identifier = n.valueOf("./*[local-name()='identifier']");
final String baseURL = n.valueOf("./*[local-name()='baseURL']");
final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");
final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
final String datestamp = n.valueOf("./*[local-name()='datestamp']");
final String harvestDate = n.valueOf("@harvestDate");
return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
}
protected DataInfo prepareDataInfo(final Document doc) {
final Node n = doc.selectSingleNode("//oaf:datainfo");
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");
final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
final String trust = n.valueOf("./oaf:trust");
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
}
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
return field(node.valueOf(xpath), info);
}
protected List<Field<String>> prepareListFields(final Node node, final String xpath, final DataInfo info) {
return listFields(info, prepareListString(node, xpath));
}
protected List<String> prepareListString(final Node node, final String xpath) {
final List<String> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) {
final String s = ((Node) o).getText().trim();
if (StringUtils.isNotBlank(s)) {
res.add(s);
}
}
return res;
}
@Override
public void close() throws IOException {
super.close();
mdstoreClient.close();
}
}

View File

@ -0,0 +1,63 @@
package eu.dnetlib.dhp.migration;
import java.io.Closeable;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.function.Consumer;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
private Connection connection;
public DbClient(final String address, final String login, final String password) {
try {
Class.forName("org.postgresql.Driver");
this.connection =
StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address);
this.connection.setAutoCommit(false);
} catch (final Exception e) {
log.error(e.getClass().getName() + ": " + e.getMessage());
throw new RuntimeException(e);
}
log.info("Opened database successfully");
}
public void processResults(final String sql, final Consumer<ResultSet> consumer) {
try (final Statement stmt = connection.createStatement()) {
stmt.setFetchSize(100);
try (final ResultSet rs = stmt.executeQuery(sql)) {
while (rs.next()) {
consumer.accept(rs);
}
} catch (final SQLException e) {
throw new RuntimeException(e);
}
} catch (final SQLException e1) {
throw new RuntimeException(e1);
}
}
@Override
public void close() throws IOException {
try {
connection.close();
} catch (final SQLException e) {
throw new RuntimeException(e);
}
}
}
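
A brief sketch of how DbClient is meant to be used (it mirrors loadClassNames in AbstractMongoExecutor); the class name, JDBC URL and credentials are placeholders.

// Hypothetical usage sketch, not part of this changeset; connection parameters are placeholders.
import eu.dnetlib.dhp.migration.DbClient;
import java.sql.SQLException;
public class DbClientExample {
    public static void main(final String[] args) throws Exception {
        try (DbClient dbClient = new DbClient("jdbc:postgresql://localhost:5432/dnet", "dnet", "changeme")) {
            dbClient.processResults("select code, name from class", rs -> {
                try {
                    System.out.println(rs.getString("code") + " -> " + rs.getString("name"));
                } catch (final SQLException e) {
                    throw new RuntimeException(e);
                }
            });
        }
    }
}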

View File

@ -0,0 +1,56 @@
package eu.dnetlib.dhp.migration;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;
public class ExtractEntitiesFromHDFSJob {
private static List<String> folderNames = Arrays.asList("db_entities", "oaf_entities", "odf_entities");
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(ExtractEntitiesFromHDFSJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final String sourcePath = parser.get("sourcePath");
final String targetPath = parser.get("graphRawPath");
final String entity = parser.get("entity");
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<String> inputRdd = sc.emptyRDD();
// union returns a new RDD: reassign it, otherwise the result of each union is discarded
for (final String p : folderNames) {
inputRdd = inputRdd.union(sc.sequenceFile(sourcePath + "/" + p, Text.class, Text.class)
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
.filter(k -> isEntityType(k._1(), entity))
.map(Tuple2::_2));
}
inputRdd.saveAsTextFile(targetPath + "/" + entity);
}
private static boolean isEntityType(final String item, final String entity) {
return StringUtils.substringAfter(item, ":").equalsIgnoreCase(entity);
}
}

View File

@ -0,0 +1,94 @@
package eu.dnetlib.dhp.migration;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bson.Document;
import com.google.common.collect.Iterables;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
public class MdstoreClient implements Closeable {
private final MongoClient client;
private final MongoDatabase db;
private static final String COLL_METADATA = "metadata";
private static final String COLL_METADATA_MANAGER = "metadataManager";
private static final Log log = LogFactory.getLog(MdstoreClient.class);
public MdstoreClient(final String baseUrl, final String dbName) {
this.client = new MongoClient(new MongoClientURI(baseUrl));
this.db = getDb(client, dbName);
}
public Map<String, String> validCollections(final String mdFormat, final String mdLayout, final String mdInterpretation) {
final Map<String, String> transactions = new HashMap<>();
for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) {
final String mdId = entry.getString("mdId");
final String currentId = entry.getString("currentId");
if (StringUtils.isNoneBlank(mdId, currentId)) {
transactions.put(mdId, currentId);
}
}
final Map<String, String> res = new HashMap<>();
for (final Document entry : getColl(db, COLL_METADATA, true).find()) {
if (entry.getString("format").equals(mdFormat) && entry.getString("layout").equals(mdLayout)
&& entry.getString("interpretation").equals(mdInterpretation) && transactions.containsKey(entry.getString("mdId"))) {
res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId")));
}
}
return res;
}
private MongoDatabase getDb(final MongoClient client, final String dbName) {
if (!Iterables.contains(client.listDatabaseNames(), dbName)) {
final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress());
log.warn(err);
throw new RuntimeException(err);
}
return client.getDatabase(dbName);
}
private MongoCollection<Document> getColl(final MongoDatabase db, final String collName, final boolean abortIfMissing) {
if (!Iterables.contains(db.listCollectionNames(), collName)) {
final String err = String.format("Missing collection '%s' in database '%s'", collName, db.getName());
log.warn(err);
if (abortIfMissing) {
throw new RuntimeException(err);
} else {
return null;
}
}
return db.getCollection(collName);
}
public Iterable<String> listRecords(final String collName) {
final MongoCollection<Document> coll = getColl(db, collName, false);
return coll == null ? new ArrayList<>()
: () -> StreamSupport.stream(coll.find().spliterator(), false)
.filter(e -> e.containsKey("body"))
.map(e -> e.getString("body"))
.iterator();
}
@Override
public void close() throws IOException {
client.close();
}
}

View File

@ -0,0 +1,520 @@
package eu.dnetlib.dhp.migration;
import java.io.Closeable;
import java.io.IOException;
import java.sql.Array;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.function.Consumer;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor implements Closeable {
private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions");
private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class);
private final DbClient dbClient;
private final long lastUpdateTimestamp;
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json")));
parser.parseArgument(args);
final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword");
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("namenode");
final String hdfsUser = parser.get("hdfsUser");
try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, hdfsNameNode, hdfsUser, dbUrl, dbUser, dbPassword)) {
log.info("Processing datasources...");
smdbe.execute("queryDatasources.sql", smdbe::processDatasource);
log.info("Processing projects...");
smdbe.execute("queryProjects.sql", smdbe::processProject);
log.info("Processing orgs...");
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
log.info("Processing relations ds <-> orgs ...");
smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
log.info("Processing projects <-> orgs ...");
smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization);
log.info("All done.");
}
}
public MigrateDbEntitiesApplication(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser);
this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
this.lastUpdateTimestamp = new Date().getTime();
}
public void execute(final String sqlFile, final Consumer<ResultSet> consumer) throws Exception {
final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile));
dbClient.processResults(sql, consumer);
}
public void processDatasource(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final Datasource ds = new Datasource();
ds.setId(createOpenaireId(10, rs.getString("datasourceid")));
ds.setOriginalId(Arrays.asList(rs.getString("datasourceid")));
ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
ds.setPid(new ArrayList<>());
ds.setDateofcollection(asString(rs.getDate("dateofcollection")));
ds.setDateoftransformation(null); // Value not returned by the SQL query
ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB
ds.setOaiprovenance(null); // Values not present in the DB
ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype")));
ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility")));
ds.setOfficialname(field(rs.getString("officialname"), info));
ds.setEnglishname(field(rs.getString("englishname"), info));
ds.setWebsiteurl(field(rs.getString("websiteurl"), info));
ds.setLogourl(field(rs.getString("logourl"), info));
ds.setContactemail(field(rs.getString("contactemail"), info));
ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info));
ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info));
ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info));
ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info));
ds.setDescription(field(rs.getString("description"), info));
ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info));
ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info));
ds.setOdpolicies(field(rs.getString("odpolicies"), info));
ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info));
ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info));
ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info));
ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info));
ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info));
ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info));
ds.setDataprovider(field(rs.getBoolean("dataprovider"), info));
ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info));
ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info));
ds.setDatauploadtype(field(rs.getString("datauploadtype"), info));
ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info));
ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info));
ds.setVersioning(field(rs.getBoolean("versioning"), info));
ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info));
ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info));
ds.setPidsystems(field(rs.getString("pidsystems"), info));
ds.setCertificates(field(rs.getString("certificates"), info));
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
ds.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal
ds.setDataInfo(info);
ds.setLastupdatetimestamp(lastUpdateTimestamp);
// rs.getString("datasourceid");
// rs.getArray("identities");
// rs.getString("officialname");
// rs.getString("englishname");
// rs.getString("contactemail");
// rs.getString("openairecompatibility"); // COMPLEX ...@@@...
// rs.getString("websiteurl");
// rs.getString("logourl");
// rs.getArray("accessinfopackage");
// rs.getDouble("latitude");
// rs.getDouble("longitude");
// rs.getString("namespaceprefix");
// rs.getInt("odnumberofitems"); // NULL
// rs.getDate("odnumberofitemsdate"); // NULL
// rs.getArray("subjects");
// rs.getString("description");
// rs.getString("odpolicies"); // NULL
// rs.getArray("odlanguages");
// rs.getArray("odcontenttypes");
// rs.getBoolean("inferred"); // false
// rs.getBoolean("deletedbyinference");// false
// rs.getDouble("trust"); // 0.9
// rs.getString("inferenceprovenance"); // NULL
// rs.getDate("dateofcollection");
// rs.getDate("dateofvalidation");
// rs.getDate("releasestartdate");
// rs.getDate("releaseenddate");
// rs.getString("missionstatementurl");
// rs.getBoolean("dataprovider");
// rs.getBoolean("serviceprovider");
// rs.getString("databaseaccesstype");
// rs.getString("datauploadtype");
// rs.getString("databaseaccessrestriction");
// rs.getString("datauploadrestriction");
// rs.getBoolean("versioning");
// rs.getString("citationguidelineurl");
// rs.getString("qualitymanagementkind");
// rs.getString("pidsystems");
// rs.getString("certificates");
// rs.getArray("policies");
// rs.getString("collectedfromid");
// rs.getString("collectedfromname");
// rs.getString("datasourcetype"); // COMPLEX
// rs.getString("provenanceaction"); //
// 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions'
// AS provenanceaction,
// rs.getString("journal"); // CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal
emitOaf(ds);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public void processProject(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final Project p = new Project();
p.setId(createOpenaireId(40, rs.getString("projectid")));
p.setOriginalId(Arrays.asList(rs.getString("projectid")));
p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
p.setPid(new ArrayList<>());
p.setDateofcollection(asString(rs.getDate("dateofcollection")));
p.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
p.setExtraInfo(new ArrayList<>()); // Values not present in the DB
p.setOaiprovenance(null); // Values not present in the DB
p.setWebsiteurl(field(rs.getString("websiteurl"), info));
p.setCode(field(rs.getString("code"), info));
p.setAcronym(field(rs.getString("acronym"), info));
p.setTitle(field(rs.getString("title"), info));
p.setStartdate(field(asString(rs.getDate("startdate")), info));
p.setEnddate(field(asString(rs.getDate("enddate")), info));
p.setCallidentifier(field(rs.getString("callidentifier"), info));
p.setKeywords(field(rs.getString("keywords"), info));
p.setDuration(field(Integer.toString(rs.getInt("duration")), info));
p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info));
p.setOamandatepublications(field(Boolean.toString(rs.getBoolean("oamandatepublications")), info));
p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info));
p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info));
p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype")));
p.setOptional1(field(rs.getString("optional1"), info));
p.setOptional2(field(rs.getString("optional2"), info));
p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info));
p.setContactfullname(field(rs.getString("contactfullname"), info));
p.setContactfax(field(rs.getString("contactfax"), info));
p.setContactphone(field(rs.getString("contactphone"), info));
p.setContactemail(field(rs.getString("contactemail"), info));
p.setSummary(field(rs.getString("summary"), info));
p.setCurrency(field(rs.getString("currency"), info));
p.setTotalcost(new Float(rs.getDouble("totalcost")));
p.setFundedamount(new Float(rs.getDouble("fundedamount")));
p.setDataInfo(info);
p.setLastupdatetimestamp(lastUpdateTimestamp);
// rs.getString("projectid");
// rs.getString("code");
// rs.getString("websiteurl");
// rs.getString("acronym");
// rs.getString("title");
// rs.getDate("startdate");
// rs.getDate("enddate");
// rs.getString("callidentifier");
// rs.getString("keywords");
// rs.getInt("duration");
// rs.getBoolean("ecsc39");
// rs.getBoolean("oamandatepublications");
// rs.getBoolean("ecarticle29_3");
// rs.getDate("dateofcollection");
// rs.getDate("dateoftransformation");
// rs.getBoolean("inferred");
// rs.getBoolean("deletedbyinference");
// rs.getDouble("trust");
// rs.getString("inferenceprovenance");
// rs.getString("optional1");
// rs.getString("optional2");
// rs.getString("jsonextrainfo");
// rs.getString("contactfullname");
// rs.getString("contactfax");
// rs.getString("contactphone");
// rs.getString("contactemail");
// rs.getString("summary");
// rs.getString("currency");
// rs.getDouble("totalcost");
// rs.getDouble("fundedamount");
// rs.getString("collectedfromid");
// rs.getString("collectedfromname");
// rs.getString("contracttype"); // COMPLEX
// rs.getString("provenanceaction"); // COMPLEX
// rs.getArray("pid");
// rs.getArray("subjects");
// rs.getArray("fundingtree");
emitOaf(p);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public void processOrganization(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final Organization o = new Organization();
o.setId(createOpenaireId(20, rs.getString("organizationid")));
o.setOriginalId(Arrays.asList(rs.getString("organizationid")));
o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
o.setPid(new ArrayList<>());
o.setDateofcollection(asString(rs.getDate("dateofcollection")));
o.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
o.setExtraInfo(new ArrayList<>()); // Values not present in the DB
o.setOaiprovenance(null); // Values not present in the DB
o.setLegalshortname(field(rs.getString("legalshortname"), info));
o.setLegalname(field(rs.getString("legalname"), info));
o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query
o.setWebsiteurl(field(rs.getString("websiteurl"), info));
o.setLogourl(field(rs.getString("logourl"), info));
o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info));
o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info));
o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info));
o.setEcresearchorganization(field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info));
o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info));
o.setEcinternationalorganizationeurinterests(field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info));
o.setEcinternationalorganization(field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info));
o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info));
o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
o.setCountry(prepareQualifierSplitting(rs.getString("country")));
o.setDataInfo(info);
o.setLastupdatetimestamp(lastUpdateTimestamp);
// rs.getString("organizationid");
// rs.getString("legalshortname");
// rs.getString("legalname");
// rs.getString("websiteurl");
// rs.getString("logourl");
// rs.getBoolean("eclegalbody");
// rs.getBoolean("eclegalperson");
// rs.getBoolean("ecnonprofit");
// rs.getBoolean("ecresearchorganization");
// rs.getBoolean("echighereducation");
// rs.getBoolean("ecinternationalorganizationeurinterests");
// rs.getBoolean("ecinternationalorganization");
// rs.getBoolean("ecenterprise");
// rs.getBoolean("ecsmevalidated");
// rs.getBoolean("ecnutscode");
// rs.getDate("dateofcollection");
// rs.getDate("dateoftransformation");
// rs.getBoolean("inferred");
// rs.getBoolean("deletedbyinference");
// rs.getDouble("trust");
// rs.getString("inferenceprovenance");
// rs.getString("collectedfromid");
// rs.getString("collectedfromname");
// rs.getString("country");
// rs.getString("provenanceaction");
// rs.getArray("pid");
emitOaf(o);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
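// Each datasource/organization row yields a pair of inverse relations (isProvidedBy and provides), both stamped with the same DataInfo and timestamp.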
public void processDatasourceOrganization(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final String orgId = createOpenaireId(20, rs.getString("organization"));
final String dsId = createOpenaireId(10, rs.getString("datasource"));
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
final Relation r1 = new Relation();
r1.setRelType("datasourceOrganization");
r1.setSubRelType("provision");
r1.setRelClass("isProvidedBy");
r1.setSource(dsId);
r1.setTarget(orgId);
r1.setCollectedFrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r1);
final Relation r2 = new Relation();
r2.setRelType("datasourceOrganization");
r2.setSubRelType("provision");
r2.setRelClass("provides");
r2.setSource(orgId);
r2.setTarget(dsId);
r2.setCollectedFrom(collectedFrom);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r2);
// rs.getString("datasource");
// rs.getString("organization");
// rs.getDate("startdate"); // NULL
// rs.getDate("enddate"); // NULL
// rs.getBoolean("inferred"); // false
// rs.getBoolean("deletedbyinference"); // false
// rs.getDouble("trust"); // 0.9
// rs.getString("inferenceprovenance"); // NULL
// rs.getString("semantics"); // 'providedBy@@@provided
// by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS
// semantics,
// rs.getString("provenanceaction"); // d.provenanceaction || '@@@' || d.provenanceaction ||
// '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
public void processProjectOrganization(final ResultSet rs) {
try {
final DataInfo info = prepareDataInfo(rs);
final String orgId = createOpenaireId(20, rs.getString("resporganization"));
final String projectId = createOpenaireId(40, rs.getString("project"));
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
final Relation r1 = new Relation();
r1.setRelType("projectOrganization");
r1.setSubRelType("participation");
r1.setRelClass("isParticipant");
r1.setSource(projectId);
r1.setTarget(orgId);
r1.setCollectedFrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r1);
final Relation r2 = new Relation();
r2.setRelType("projectOrganization");
r2.setSubRelType("participation");
r2.setRelClass("hasParticipant");
r2.setSource(orgId);
r2.setTarget(projectId);
r2.setCollectedFrom(collectedFrom);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
emitOaf(r2);
// rs.getString("project");
// rs.getString("resporganization");
// rs.getInt("participantnumber");
// rs.getDouble("contribution");
// rs.getDate("startdate");// null
// rs.getDate("enddate");// null
// rs.getBoolean("inferred");// false
// rs.getBoolean("deletedbyinference"); // false
// rs.getDouble("trust");
// rs.getString("inferenceprovenance"); // NULL
// rs.getString("semantics"); // po.semanticclass || '@@@' || po.semanticclass ||
// '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics,
// rs.getString("provenanceaction"); //
// 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions'
// AS provenanceaction
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException {
final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
final String inferenceprovenance = rs.getString("inferenceprovenance");
final Boolean inferred = rs.getBoolean("inferred");
final String trust = rs.getString("trust");
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
}
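// Splits a '@@@'-encoded qualifier, i.e. 'classid@@@classname@@@schemeid@@@schemename' (the format of the provenanceaction values built by the SQL queries).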
private Qualifier prepareQualifierSplitting(final String s) {
if (StringUtils.isBlank(s)) { return null; }
final String[] arr = s.split("@@@");
return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null;
}
private List<Field<String>> prepareListFields(final Array array, final DataInfo info) {
try {
return listFields(info, (String[]) array.getArray());
} catch (final SQLException e) {
throw new RuntimeException("Invalid SQL array", e);
}
}
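// Parses a structured property encoded as 'value###classid@@@classname@@@schemeid@@@schemename'.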
private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) {
if (StringUtils.isBlank(s)) { return null; }
final String[] parts = s.split("###");
if (parts.length == 2) {
final String value = parts[0];
final String[] arr = parts[1].split("@@@");
if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); }
}
return null;
}
private List<StructuredProperty> prepareListOfStructProps(final Array array, final DataInfo dataInfo) throws SQLException {
final List<StructuredProperty> res = new ArrayList<>();
if (array != null) {
for (final String s : (String[]) array.getArray()) {
final StructuredProperty sp = prepareStructProp(s, dataInfo);
if (sp != null) {
res.add(sp);
}
}
}
return res;
}
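// The journal column is encoded as 'issn@@@eissn@@@lissn' (CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) in the datasource query).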
private Journal prepareJournal(final String name, final String sj, final DataInfo info) {
if (StringUtils.isNotBlank(sj)) {
final String[] arr = sj.split("@@@");
if (arr.length == 3) {
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null;
final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;
final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;
if (issn != null || eissn != null
|| lissn != null) { return journal(name, issn, eissn, lissn, null, null, null, null, null, null, null, info); }
}
}
return null;
}
@Override
public void close() throws IOException {
super.close();
dbClient.close();
}
}

View File

@ -0,0 +1,45 @@
package eu.dnetlib.dhp.migration;
import org.apache.commons.io.IOUtils;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class MigrateMongoMdstoresApplication {
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
parser.parseArgument(args);
final String mongoBaseUrl = parser.get("mongoBaseUrl");
final String mongoDb = parser.get("mongoDb");
final String mdFormat = parser.get("mdFormat");
final String mdLayout = parser.get("mdLayout");
final String mdInterpretation = parser.get("mdInterpretation");
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("namenode");
final String hdfsUser = parser.get("hdfsUser");
final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword");
if (mdFormat.equalsIgnoreCase("oaf")) {
try (final OafMigrationExecutor mig =
new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
}
} else if (mdFormat.equalsIgnoreCase("odf")) {
try (final OdfMigrationExecutor mig =
new OdfMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
}
} else {
throw new RuntimeException("Format not supported: " + mdFormat);
}
}
}

View File

@ -0,0 +1,251 @@
package eu.dnetlib.dhp.migration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.Node;
import eu.dnetlib.dhp.migration.pace.PacePerson;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OafMigrationExecutor extends AbstractMongoExecutor {
private static final Log log = LogFactory.getLog(OafMigrationExecutor.class);
public OafMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb,
final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
}
@Override
protected void registerNamespaces(final Map<String, String> nsContext) {
super.registerNamespaces(nsContext);
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
}
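// OAF records expose their metadata through the Dublin Core namespace; the ODF executor instead binds 'dc' to the DataCite kernel-3 schema.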
@Override
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
final List<Author> res = new ArrayList<>();
int pos = 1;
for (final Object o : doc.selectNodes("//dc:creator")) {
final Node n = (Node) o;
final Author author = new Author();
author.setFullname(n.getText());
author.setRank(pos++);
final PacePerson p = new PacePerson(n.getText(), false);
if (p.isAccurate()) {
author.setName(p.getNormalisedFirstName());
author.setSurname(p.getNormalisedSurname());
}
res.add(author);
}
return res;
}
@Override
protected Qualifier prepareLanguages(final Document doc) {
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
}
@Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:subject", info);
}
@Override
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
}
@Override
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:description", info);
}
@Override
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:publisher", info);
}
@Override
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:format", info);
}
@Override
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributor", info);
}
@Override
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:coverage", info);
}
@Override
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
final List<Instance> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:identifier")) {
final String url = ((Node) o).getText().trim();
if (url.startsWith("http")) {
final Instance instance = new Instance();
instance.setUrl(Arrays.asList(url));
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
res.add(instance);
}
}
return res;
}
@Override
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:source", info);
}
@Override
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
// SOFTWARES
@Override
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
// DATASETS
@Override
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
@Override
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
// OTHER PRODUCTS
@Override
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
return new ArrayList<>(); // NOT PRESENT IN OAF
}
@Override
protected List<Oaf> addOtherResultRels(final Document doc,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp) {
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
final List<Oaf> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
final String otherId = createOpenaireId(50, ((Node) o).getText());
final Relation r1 = new Relation();
r1.setRelType("resultResult");
r1.setSubRelType("publicationDataset");
r1.setRelClass("isRelatedTo");
r1.setSource(docId);
r1.setTarget(otherId);
r1.setCollectedFrom(Arrays.asList(collectedFrom));
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r1);
final Relation r2 = new Relation();
r2.setRelType("resultResult");
r2.setSubRelType("publicationDataset");
r2.setRelClass("isRelatedTo");
r2.setSource(otherId);
r2.setTarget(docId);
r2.setCollectedFrom(Arrays.asList(collectedFrom));
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r2);
}
return res;
}
@Override
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
}
}

View File

@ -0,0 +1,273 @@
package eu.dnetlib.dhp.migration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.Node;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OdfMigrationExecutor extends AbstractMongoExecutor {
private static final Log log = LogFactory.getLog(OdfMigrationExecutor.class);
public OdfMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb,
final String dbUrl, final String dbUser,
final String dbPassword) throws Exception {
super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
}
@Override
protected void registerNamespaces(final Map<String, String> nsContext) {
super.registerNamespaces(nsContext);
nsContext.put("dc", "http://datacite.org/schema/kernel-3");
}
@Override
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
}
@Override
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
final List<Author> res = new ArrayList<>();
int pos = 1;
for (final Object o : doc.selectNodes("//dc:creator")) {
final Node n = (Node) o;
final Author author = new Author();
author.setFullname(n.valueOf("./dc:creatorName"));
author.setName(n.valueOf("./dc:givenName"));
author.setSurname(n.valueOf("./dc:familyName"));
author.setAffiliation(prepareListFields(doc, "./dc:affiliation", info));
author.setPid(preparePids(doc, info));
author.setRank(pos++);
res.add(author);
}
return res;
}
private List<StructuredProperty> preparePids(final Document doc, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : doc.selectNodes("./dc:nameIdentifier")) {
res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info));
}
return res;
}
@Override
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
final List<Instance> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:alternateIdentifier[@alternateIdentifierType='URL']")) {
final Instance instance = new Instance();
instance.setUrl(Arrays.asList(((Node) o).getText().trim()));
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
res.add(instance);
}
return res;
}
@Override
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:date")) {
final String dateType = ((Node) o).valueOf("@dateType");
if (StringUtils.isNotBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued")
&& !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) {
res.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date", info));
}
}
return res;
}
@Override
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributorName", info);
}
@Override
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:format", info);
}
@Override
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:publisher", info);
}
@Override
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:description[@descriptionType='Abstract']", info);
}
@Override
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//dc:subject", info);
}
@Override
protected Qualifier prepareLanguages(final Document doc) {
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
}
@Override
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactGroup']/dc:contributorName", info);
}
@Override
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactPerson']/dc:contributorName", info);
}
@Override
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
return prepareQualifier(doc, "//dc:format", "dnet:programming_languages", "dnet:programming_languages");
}
@Override
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
return null; // Not present in ODF ???
}
@Override
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
return new ArrayList<>(); // Not present in ODF ???
}
@Override
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
}
// DATASETS
@Override
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
final List<GeoLocation> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//dc:geoLocation")) {
final GeoLocation loc = new GeoLocation();
loc.setBox(((Node) o).valueOf("./dc:geoLocationBox"));
loc.setPlace(((Node) o).valueOf("./dc:geoLocationPlace"));
loc.setPoint(((Node) o).valueOf("./dc:geoLocationPoint"));
res.add(loc);
}
return res;
}
@Override
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
return null; // Not present in ODF ???
}
@Override
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:date[@dateType='Updated']", info);
}
@Override
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:version", info);
}
@Override
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:size", info);
}
@Override
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
return null; // Not present in ODF ???
}
@Override
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
return prepareField(doc, "//dc:date[@dateType='Issued']", info);
}
@Override
protected List<Oaf> addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
final List<Oaf> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//*[local-name() = 'resource']//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) {
final String otherId = createOpenaireId(50, ((Node) o).getText());
final String type = ((Node) o).valueOf("@relationType");
if (type.equals("IsSupplementTo")) {
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "isSupplementTo"));
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "isSupplementedBy"));
} else if (type.equals("IsPartOf")) {
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf"));
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts"));
} else {}
}
return res;
}
private Relation prepareOtherResultRel(final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp,
final String source,
final String target,
final String subRelType,
final String relClass) {
final Relation r = new Relation();
r.setRelType("resultResult");
r.setSubRelType(subRelType);
r.setRelClass(relClass);
r.setSource(source);
r.setTarget(target);
r.setCollectedFrom(Arrays.asList(collectedFrom));
r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp);
return r;
}
@Override
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource");
}
}

View File

@ -0,0 +1,176 @@
package eu.dnetlib.dhp.migration.pace;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.text.WordUtils;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
public class PacePerson {
private static final String UTF8 = "UTF-8";
private List<String> name = Lists.newArrayList();
private List<String> surname = Lists.newArrayList();
private List<String> fullname = Lists.newArrayList();
private final String original;
private static Set<String> particles = null;
public static final String capitalize(final String s) {
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
}
public static final String dotAbbreviations(final String s) {
return s.length() == 1 ? s + "." : s;
}
public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = new HashSet<>();
try {
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
h.add(s);
}
} catch (final Throwable e) {
return new HashSet<>();
}
return h;
}
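// Examples of the parsing below: "Artini, Michele" -> surname [Artini], name [Michele];
// "Michele G. Artini" -> name [Michele, G.], surname [Artini]; "Michele ARTINI" -> the upper-case token becomes the surname.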
public PacePerson(String s, final boolean aggressive) {
original = s;
s = Normalizer.normalize(s, Normalizer.Form.NFD);
s = s.replaceAll("\\(.+\\)", "");
s = s.replaceAll("\\[.+\\]", "");
s = s.replaceAll("\\{.+\\}", "");
s = s.replaceAll("\\s+-\\s+", "-");
s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
s = s.replaceAll("\\d", " ");
s = s.replaceAll("\\n", " ");
s = s.replaceAll("\\.", " ");
s = s.replaceAll("\\s+", " ");
if (aggressive) {
s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
if (s.contains(",")) {
final String[] arr = s.split(",");
if (arr.length == 1) {
fullname = splitTerms(arr[0]);
} else if (arr.length > 1) {
surname = splitTerms(arr[0]);
name = splitTerms(arr[1]);
fullname.addAll(surname);
fullname.addAll(name);
}
} else {
fullname = splitTerms(s);
int lastInitialPosition = fullname.size();
boolean hasSurnameInUpperCase = false;
for (int i = 0; i < fullname.size(); i++) {
final String term = fullname.get(i);
if (term.length() == 1) {
lastInitialPosition = i;
} else if (term.equals(term.toUpperCase())) {
hasSurnameInUpperCase = true;
}
}
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
name = fullname.subList(0, lastInitialPosition + 1);
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
for (final String term : fullname) {
if (term.length() > 1 && term.equals(term.toUpperCase())) {
surname.add(term);
} else {
name.add(term);
}
}
}
}
}
private List<String> splitTerms(final String s) {
if (particles == null) {
particles = loadFromClasspath("/eu/dnetlib/dhp/migration/pace/name_particles.txt");
}
final List<String> list = Lists.newArrayList();
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
if (!particles.contains(part.toLowerCase())) {
list.add(part);
}
}
return list;
}
public List<String> getName() {
return name;
}
public String getNameString() {
return Joiner.on(" ").join(getName());
}
public List<String> getSurname() {
return surname;
}
public List<String> getFullname() {
return fullname;
}
public String getOriginal() {
return original;
}
public String hash() {
return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
}
public String getNormalisedFirstName() {
return Joiner.on(" ").join(getCapitalFirstnames());
}
public String getNormalisedSurname() {
return Joiner.on(" ").join(getCapitalSurname());
}
public String getSurnameString() {
return Joiner.on(" ").join(getSurname());
}
public String getNormalisedFullname() {
return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
}
public List<String> getCapitalFirstnames() {
return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
}
public List<String> getCapitalSurname() {
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
}
public List<String> getNameWithAbbreviations() {
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
}
public boolean isAccurate() {
return name != null && surname != null && !name.isEmpty() && !surname.isEmpty();
}
}

View File

@ -0,0 +1,26 @@
[
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the HDFS source path which contains the sequential file",
"paramRequired": true
},
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "g",
"paramLongName": "graphRawPath",
"paramDescription": "the path of the graph Raw in hdfs",
"paramRequired": true
},
{
"paramName": "e",
"paramLongName": "entity",
"paramDescription": "The entity to extract",
"paramRequired": true
}
]

View File

@ -0,0 +1,38 @@
[
{
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the path where storing the sequential file",
"paramRequired": true
},
{
"paramName": "n",
"paramLongName": "namenode",
"paramDescription": "the Name Node URI",
"paramRequired": true
},
{
"paramName": "u",
"paramLongName": "hdfsUser",
"paramDescription": "the user wich create the hdfs seq file",
"paramRequired": true
},
{
"paramName": "dburl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "dbuser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "dbpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
}
]

View File

@ -0,0 +1,68 @@
[
{
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the path where storing the sequential file",
"paramRequired": true
},
{
"paramName": "n",
"paramLongName": "namenode",
"paramDescription": "the Name Node URI",
"paramRequired": true
},
{
"paramName": "u",
"paramLongName": "hdfsUser",
"paramDescription": "the user wich create the hdfs seq file",
"paramRequired": true
},
{
"paramName": "mongourl",
"paramLongName": "mongoBaseUrl",
"paramDescription": "mongoDB url, example: mongodb://[username:password@]host[:port]",
"paramRequired": true
},
{
"paramName": "db",
"paramLongName": "mongoDb",
"paramDescription": "mongo database",
"paramRequired": true
},
{
"paramName": "f",
"paramLongName": "mdFormat",
"paramDescription": "metadata format",
"paramRequired": true
},
{
"paramName": "l",
"paramLongName": "mdLayout",
"paramDescription": "metadata layout",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "mdInterpretation",
"paramDescription": "metadata interpretation",
"paramRequired": true
},
{
"paramName": "pgurl",
"paramLongName": "postgresUrl",
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
"paramRequired": true
},
{
"paramName": "pguser",
"paramLongName": "postgresUser",
"paramDescription": "postgres user",
"paramRequired": false
},
{
"paramName": "pgpasswd",
"paramLongName": "postgresPassword",
"paramDescription": "postgres password",
"paramRequired": false
}
]

View File

@ -0,0 +1,22 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hdfsUser</name>
<value>dnet</value>
</property>
</configuration>

View File

@ -0,0 +1,282 @@
<workflow-app name="import Entities from aggretor to HDFS" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the base path to store hdfs file</description>
</property>
<property>
<name>graphRawPath</name>
<description>the graph Raw base path</description>
</property>
<property>
<name>postgresURL</name>
<description>the postgres URL to access to the database</description>
</property>
<property>
<name>postgresUser</name>
<description>the user postgres</description>
</property>
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>mongourl</name>
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
</property>
<property>
<name>mongoDb</name>
<description>mongo database</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}'/>
<mkdir path='${workingPath}'/>
</fs>
<ok to="ImportEntitiesFromPostgres"/>
<error to="Kill"/>
</action>
<action name="ImportEntitiesFromPostgres">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateDbEntitiesApplication</main-class>
<arg>-p</arg><arg>${workingPath}/db_entities</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-dburl</arg><arg>${postgresURL}</arg>
<arg>-dbuser</arg><arg>${postgresUser}</arg>
<arg>-dbpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportODFEntitiesFromMongoDB"/>
<error to="Kill"/>
</action>
<action name="ImportODFEntitiesFromMongoDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/odf_entities</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-mongourl</arg><arg>${mongourl}</arg>
<arg>-db</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>ODF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ImportOAFEntitiesFromMongoDB"/>
<error to="Kill"/>
</action>
<action name="ImportOAFEntitiesFromMongoDB">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
<arg>-p</arg><arg>${workingPath}/oaf_entities</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${hdfsUser}</arg>
<arg>-mongourl</arg><arg>${mongourl}</arg>
<arg>-db</arg><arg>${mongoDb}</arg>
<arg>-f</arg><arg>OAF</arg>
<arg>-l</arg><arg>store</arg>
<arg>-i</arg><arg>cleaned</arg>
<arg>-pgurl</arg><arg>${postgresURL}</arg>
<arg>-pguser</arg><arg>${postgresUser}</arg>
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
</java>
<ok to="ExtractPublication"/>
<error to="Kill"/>
</action>
<action name="ExtractPublication">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: publication</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/publication</arg>
<arg>-e</arg><arg>publication</arg>
</spark>
<ok to="ExtractDataset"/>
<error to="Kill"/>
</action>
<action name="ExtractDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: dataset</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/dataset</arg>
<arg>-e</arg><arg>dataset</arg>
</spark>
<ok to="ExtractSoftware"/>
<error to="Kill"/>
</action>
<action name="ExtractSoftware">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: software</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/software</arg>
<arg>-e</arg><arg>software</arg>
</spark>
<ok to="ExtractORP"/>
<error to="Kill"/>
</action>
<action name="ExtractORP">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: otherresearchproduct</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/otherresearchproduct</arg>
<arg>-e</arg><arg>otherresearchproduct</arg>
</spark>
<ok to="ExtractDatasource"/>
<error to="Kill"/>
</action>
<action name="ExtractDatasource">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: datasource</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/datasource</arg>
<arg>-e</arg><arg>datasource</arg>
</spark>
<ok to="ExtractOrganization"/>
<error to="Kill"/>
</action>
<action name="ExtractOrganization">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: organization</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/organization</arg>
<arg>-e</arg><arg>organization</arg>
</spark>
<ok to="ExtractProject"/>
<error to="Kill"/>
</action>
<action name="ExtractProject">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: project</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/project</arg>
<arg>-e</arg><arg>project</arg>
</spark>
<ok to="ExtractRelation"/>
<error to="Kill"/>
</action>
<action name="ExtractRelation">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractEntities: relation</name>
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>-g</arg><arg>${graphRawPath}/relation</arg>
<arg>-e</arg><arg>relation</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,7 @@
van
der
de
dell
sig
mr
mrs

View File

@ -0,0 +1,17 @@
SELECT
dor.datasource AS datasource,
dor.organization AS organization,
NULL AS startdate,
NULL AS enddate,
false AS inferred,
false AS deletedbyinference,
0.9 AS trust,
NULL AS inferenceprovenance,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
'providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS semantics,
d.provenanceaction || '@@@' || d.provenanceaction || '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction
FROM dsm_datasource_organization dor
LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = d.collectedfrom)
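
The query above packs each vocabulary term into a single text column using @@@ as a separator, in the order classid@@@classname@@@schemeid@@@schemename. A minimal sketch of how such a packed value can be unpacked on the Java side; the QualifierUnpackExample class, the QualifierParts holder and the unpackQualifier helper are illustrative only and not part of this commit.

// Illustrative only: unpacking a 'classid@@@classname@@@schemeid@@@schemename'
// value as produced by the query above. Not part of this commit.
public class QualifierUnpackExample {

    static final class QualifierParts {
        String classid, classname, schemeid, schemename;
    }

    static QualifierParts unpackQualifier(final String packed) {
        final String[] p = packed.split("@@@", -1); // -1 keeps empty trailing fields
        final QualifierParts q = new QualifierParts();
        q.classid = p.length > 0 ? p[0] : null;
        q.classname = p.length > 1 ? p[1] : null;
        q.schemeid = p.length > 2 ? p[2] : null;
        q.schemename = p.length > 3 ? p[3] : null;
        return q;
    }

    public static void main(final String[] args) {
        final QualifierParts q = unpackQualifier(
                "providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies");
        System.out.println(q.classid + " | " + q.classname + " | " + q.schemeid + " | " + q.schemename);
    }
}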

View File

@ -0,0 +1,147 @@
SELECT
d.id AS datasourceid,
d.id || array_agg(distinct di.pid) AS identities,
d.officialname AS officialname,
d.englishname AS englishname,
d.contactemail AS contactemail,
CASE
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire-cris_1.1'])
THEN
'openaire-cris_1.1@@@OpenAIRE CRIS v1.1@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0'])
THEN
'driver-openaire2.0@@@OpenAIRE 2.0+ (DRIVER OA, EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver'])
THEN
'driver@@@OpenAIRE Basic (DRIVER OA)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0'])
THEN
'openaire2.0@@@OpenAIRE 2.0 (EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0'])
THEN
'openaire3.0@@@OpenAIRE 3.0 (OA, funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data'])
THEN
'openaire2.0_data@@@OpenAIRE Data (funded, referenced datasets)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native'])
THEN
'native@@@proprietary@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['hostedBy'])
THEN
'hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible'])
THEN
'notCompatible@@@under validation@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
ELSE
'UNKNOWN@@@not available@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
END AS openairecompatibility,
d.websiteurl AS websiteurl,
d.logourl AS logourl,
array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END) AS accessinfopackage,
d.latitude AS latitude,
d.longitude AS longitude,
d.namespaceprefix AS namespaceprefix,
NULL AS odnumberofitems,
NULL AS odnumberofitemsdate,
(SELECT array_agg(s|| '###keywords@@@keywords@@@dnet:subject_classification_typologies@@@dnet:subject_classification_typologies')
FROM UNNEST(
ARRAY(
SELECT trim(s)
FROM unnest(string_to_array(d.subjects, '@@')) AS s)) AS s) AS subjects,
d.description AS description,
NULL AS odpolicies,
ARRAY(SELECT trim(s)
FROM unnest(string_to_array(d.languages, ',')) AS s) AS odlanguages,
ARRAY(SELECT trim(s)
FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes,
false AS inferred,
false AS deletedbyinference,
0.9 AS trust,
NULL AS inferenceprovenance,
d.dateofcollection AS dateofcollection,
d.dateofvalidation AS dateofvalidation,
-- re3data fields
d.releasestartdate AS releasestartdate,
d.releaseenddate AS releaseenddate,
d.missionstatementurl AS missionstatementurl,
d.dataprovider AS dataprovider,
d.serviceprovider AS serviceprovider,
d.databaseaccesstype AS databaseaccesstype,
d.datauploadtype AS datauploadtype,
d.databaseaccessrestriction AS databaseaccessrestriction,
d.datauploadrestriction AS datauploadrestriction,
d.versioning AS versioning,
d.citationguidelineurl AS citationguidelineurl,
d.qualitymanagementkind AS qualitymanagementkind,
d.pidsystems AS pidsystems,
d.certificates AS certificates,
ARRAY[]::text[] AS policies,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
d.typology || '@@@' || CASE
WHEN (d.typology = 'crissystem') THEN 'CRIS System'
WHEN (d.typology = 'datarepository::unknown') THEN 'Data Repository'
WHEN (d.typology = 'aggregator::datarepository') THEN 'Data Repository Aggregator'
WHEN (d.typology = 'infospace') THEN 'Information Space'
WHEN (d.typology = 'pubsrepository::institutional') THEN 'Institutional Repository'
WHEN (d.typology = 'aggregator::pubsrepository::institutional') THEN 'Institutional Repository Aggregator'
WHEN (d.typology = 'pubsrepository::journal') THEN 'Journal'
WHEN (d.typology = 'aggregator::pubsrepository::journals') THEN 'Journal Aggregator/Publisher'
WHEN (d.typology = 'pubsrepository::mock') THEN 'Other'
WHEN (d.typology = 'pubscatalogue::unknown') THEN 'Publication Catalogue'
WHEN (d.typology = 'pubsrepository::unknown') THEN 'Publication Repository'
WHEN (d.typology = 'aggregator::pubsrepository::unknown') THEN 'Publication Repository Aggregator'
WHEN (d.typology = 'entityregistry') THEN 'Registry'
WHEN (d.typology = 'scholarcomminfra') THEN 'Scholarly Comm. Infrastructure'
WHEN (d.typology = 'pubsrepository::thematic') THEN 'Thematic Repository'
WHEN (d.typology = 'websource') THEN 'Web Source'
WHEN (d.typology = 'entityregistry::projects') THEN 'Funder database'
WHEN (d.typology = 'entityregistry::repositories') THEN 'Registry of repositories'
WHEN (d.typology = 'softwarerepository') THEN 'Software Repository'
WHEN (d.typology = 'aggregator::softwarerepository') THEN 'Software Repository Aggregator'
WHEN (d.typology = 'orprepository') THEN 'Repository'
ELSE 'Other'
END || '@@@dnet:datasource_typologies@@@dnet:datasource_typologies' AS datasourcetype,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal
FROM dsm_datasources d
LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id)
LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource)
LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource)
GROUP BY
d.id,
d.officialname,
d.englishname,
d.websiteurl,
d.logourl,
d.contactemail,
d.namespaceprefix,
d.description,
d.latitude,
d.longitude,
d.dateofcollection,
d.dateofvalidation,
d.releasestartdate,
d.releaseenddate,
d.missionstatementurl,
d.dataprovider,
d.serviceprovider,
d.databaseaccesstype,
d.datauploadtype,
d.databaseaccessrestriction,
d.datauploadrestriction,
d.versioning,
d.citationguidelineurl,
d.qualitymanagementkind,
d.pidsystems,
d.certificates,
dc.id,
dc.officialname,
d.issn,
d.eissn,
d.lissn

View File

@ -0,0 +1,36 @@
SELECT
o.id AS organizationid,
o.legalshortname AS legalshortname,
o.legalname AS legalname,
o.websiteurl AS websiteurl,
o.logourl AS logourl,
o.ec_legalbody AS eclegalbody,
o.ec_legalperson AS eclegalperson,
o.ec_nonprofit AS ecnonprofit,
o.ec_researchorganization AS ecresearchorganization,
o.ec_highereducation AS echighereducation,
o.ec_internationalorganizationeurinterests AS ecinternationalorganizationeurinterests,
o.ec_internationalorganization AS ecinternationalorganization,
o.ec_enterprise AS ecenterprise,
o.ec_smevalidated AS ecsmevalidated,
o.ec_nutscode AS ecnutscode,
o.dateofcollection AS dateofcollection,
o.lastupdate AS dateoftransformation,
false AS inferred,
false AS deletedbyinference,
o.trust AS trust,
'' AS inferenceprovenance,
d.id AS collectedfromid,
d.officialname AS collectedfromname,
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
ARRAY[]::text[] AS pid
FROM dsm_organizations o
LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom)

View File

@ -0,0 +1,53 @@
SELECT
o.id AS organizationid,
coalesce((array_agg(a.acronym))[1], o.name) AS legalshortname,
o.name AS legalname,
array_agg(DISTINCT n.name) AS "alternativeNames",
(array_agg(u.url))[1] AS websiteurl,
o.modification_date AS dateoftransformation,
false AS inferred,
false AS deletedbyinference,
0.95 AS trust,
'' AS inferenceprovenance,
'openaire____::openorgs' AS collectedfromid,
'OpenOrgs Database' AS collectedfromname,
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
FROM organizations o
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
LEFT OUTER JOIN urls u ON (u.id = o.id)
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
LEFT OUTER JOIN other_names n ON (n.id = o.id)
GROUP BY
o.id,
o.name,
o.modification_date,
o.country
UNION ALL
SELECT
'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS organizationid,
n.name AS legalshortname,
n.name AS legalname,
ARRAY[]::text[] AS "alternativeNames",
(array_agg(u.url))[1] AS websiteurl,
o.modification_date AS dateoftransformation,
false AS inferred,
false AS deletedbyinference,
0.88 AS trust,
'' AS inferenceprovenance,
'openaire____::openorgs' AS collectedfromid,
'OpenOrgs Database' AS collectedfromname,
o.country || '@@@dnet:countries' AS country,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
FROM other_names n
LEFT OUTER JOIN organizations o ON (n.id = o.id)
LEFT OUTER JOIN urls u ON (u.id = o.id)
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
GROUP BY
o.id, o.modification_date, o.country, n.name

View File

@ -0,0 +1,19 @@
SELECT
po.project AS project,
po.resporganization AS resporganization,
po.participantnumber AS participantnumber,
po.contribution AS contribution,
NULL AS startdate,
NULL AS enddate,
false AS inferred,
false AS deletedbyinference,
po.trust AS trust,
NULL AS inferenceprovenance,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
po.semanticclass || '@@@' || po.semanticclass || '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics,
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction
FROM project_organization po
LEFT OUTER JOIN projects p ON (p.id = po.project)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)

View File

@ -0,0 +1,90 @@
SELECT
p.id AS projectid,
p.code AS code,
p.websiteurl AS websiteurl,
p.acronym AS acronym,
p.title AS title,
p.startdate AS startdate,
p.enddate AS enddate,
p.call_identifier AS callidentifier,
p.keywords AS keywords,
p.duration AS duration,
p.ec_sc39 AS ecsc39,
p.oa_mandate_for_publications AS oamandatepublications,
p.ec_article29_3 AS ecarticle29_3,
p.dateofcollection AS dateofcollection,
p.lastupdate AS dateoftransformation,
p.inferred AS inferred,
p.deletedbyinference AS deletedbyinference,
p.trust AS trust,
p.inferenceprovenance AS inferenceprovenance,
p.optional1 AS optional1,
p.optional2 AS optional2,
p.jsonextrainfo AS jsonextrainfo,
p.contactfullname AS contactfullname,
p.contactfax AS contactfax,
p.contactphone AS contactphone,
p.contactemail AS contactemail,
p.summary AS summary,
p.currency AS currency,
p.totalcost AS totalcost,
p.fundedamount AS fundedamount,
dc.id AS collectedfromid,
dc.officialname AS collectedfromname,
ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype,
pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
array_agg(DISTINCT fp.path) AS fundingtree
FROM projects p
LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass)
LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme)
GROUP BY
p.id,
p.code,
p.websiteurl,
p.acronym,
p.title,
p.startdate,
p.enddate,
p.call_identifier,
p.keywords,
p.duration,
p.ec_sc39,
p.oa_mandate_for_publications,
p.ec_article29_3,
p.dateofcollection,
p.inferred,
p.deletedbyinference,
p.trust,
p.inferenceprovenance,
p.contactfullname,
p.contactfax,
p.contactphone,
p.contactemail,
p.summary,
p.currency,
p.totalcost,
p.fundedamount,
dc.id,
dc.officialname,
pac.code, pac.name, pas.code, pas.name,
ctc.code, ctc.name, cts.code, cts.name;

View File

@ -0,0 +1,17 @@
SELECT local_id AS id1, oa_original_id AS id2 FROM openaire_simrels WHERE reltype = 'is_similar'
UNION ALL
SELECT
o.id AS id1,
'openorgsmesh'||substring(o.id, 13)||'-'||md5(a.acronym) AS id2
FROM acronyms a
LEFT OUTER JOIN organizations o ON (a.id = o.id)
UNION ALL
SELECT
o.id AS id1,
'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS id2
FROM other_names n
LEFT OUTER JOIN organizations o ON (n.id = o.id)

View File

@ -0,0 +1,9 @@
# Set root logger level to INFO and its only appender to A1.
log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
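
With the ConversionPattern above (%-4r [%t] %-5p %c %x - %m%n) a log line renders as elapsed milliseconds, thread, level, category, NDC and message. A minimal sketch assuming log4j 1.x on the classpath; the LoggingDemo class is illustrative only.

import org.apache.log4j.Logger;

// Illustrative only: shows roughly what the A1 pattern above produces.
public class LoggingDemo {
    private static final Logger log = Logger.getLogger(LoggingDemo.class);

    public static void main(final String[] args) {
        // prints something like: 0    [main] INFO  LoggingDemo  - starting up
        log.info("starting up");
    }
}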

View File

@ -1,15 +1,45 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>

File diff suppressed because one or more lines are too long

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@ -1,10 +1,18 @@
package eu.dnetlib.dhp.graph;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.*;
import java.util.Map;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
public class GraphMappingUtils {
public final static Map<String, Class> types = Maps.newHashMap();

View File

@ -4,12 +4,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
public class SparkGraphImporterJob {
@ -33,13 +31,8 @@ public class SparkGraphImporterJob {
// Read the input file and convert it into RDD of serializable object
GraphMappingUtils.types.forEach((name, clazz) -> {
final JavaRDD<Tuple2<String, String>> inputRDD = sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class)
.map(item -> new Tuple2<>(item._1.toString(), item._2.toString()));
spark.createDataset(inputRDD
.filter(s -> s._1().equals(clazz.getName()))
.map(Tuple2::_2)
.map(s -> new ObjectMapper().readValue(s, clazz))
spark.createDataset(sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class)
.map(s -> new ObjectMapper().readValue(s._2().toString(), clazz))
.rdd(), Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)

View File

@ -0,0 +1,10 @@
sparkDriverMemory=8G
sparkExecutorMemory=8G
#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
outputPath=/tmp/openaire_provision
format=TMF
batchSize=2000
sparkExecutorCoresForIndexing=64
reuseRecords=true

View File

@ -0,0 +1,92 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.1.6-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-graph-provision</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
</dependency>
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>com.mycila.xmltool</groupId>
<artifactId>xmltool</artifactId>
</dependency>
<dependency>
<groupId>org.antlr</groupId>
<artifactId>stringtemplate</artifactId>
</dependency>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
</dependency>
<dependency>
<groupId>com.lucidworks.spark</groupId>
<artifactId>spark-solr</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
</dependency>
<dependency>
<groupId>org.noggit</groupId>
<artifactId>noggit</artifactId>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</dependency>
<dependency>
<groupId>org.apache.cxf</groupId>
<artifactId>cxf-rt-transports-http</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>cnr-rmi-api</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,257 @@
package eu.dnetlib.dhp.graph;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.graph.model.*;
import eu.dnetlib.dhp.graph.utils.ContextMapper;
import eu.dnetlib.dhp.graph.utils.GraphMappingUtils;
import eu.dnetlib.dhp.graph.utils.XmlRecordFactory;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity;
/**
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization)
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
*
* The operation is implemented by creating the union between the entity types (E), joined by the relationships (R), and again
* by E, finally grouped by E.id;
*
* Different manipulations of the E and R sets are introduced to reduce the complexity of the operation:
* 1) treat the object payload as a string, extracting only the necessary information beforehand using json path,
* since deserializing it with jackson's object mapper has a higher memory footprint.
*
* 2) only consider rels that are not virtually deleted ($.dataInfo.deletedbyinference == false)
* 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S
* and E_target = T. Objects in T are heavily pruned of all the unnecessary information
*
* 4) perform the join as (((T join R) union S) groupby S.id), yielding S -> [ <T, R> ]
*/
public class GraphJoiner implements Serializable {
public static final int MAX_RELS = 100;
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
private SparkSession spark;
private ContextMapper contextMapper;
private String inputPath;
private String outPath;
public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String inputPath, String outPath) {
this.spark = spark;
this.contextMapper = contextMapper;
this.inputPath = inputPath;
this.outPath = outPath;
}
public GraphJoiner adjacencyLists() {
final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext());
// read each entity
JavaPairRDD<String, TypedRow> datasource = readPathEntity(sc, getInputPath(), "datasource");
JavaPairRDD<String, TypedRow> organization = readPathEntity(sc, getInputPath(), "organization");
JavaPairRDD<String, TypedRow> project = readPathEntity(sc, getInputPath(), "project");
JavaPairRDD<String, TypedRow> dataset = readPathEntity(sc, getInputPath(), "dataset");
JavaPairRDD<String, TypedRow> otherresearchproduct = readPathEntity(sc, getInputPath(), "otherresearchproduct");
JavaPairRDD<String, TypedRow> software = readPathEntity(sc, getInputPath(), "software");
JavaPairRDD<String, TypedRow> publication = readPathEntity(sc, getInputPath(), "publication");
// create the union between all the entities
final String entitiesPath = getOutPath() + "/entities";
datasource
.union(organization)
.union(project)
.union(dataset)
.union(otherresearchproduct)
.union(software)
.union(publication)
.map(e -> new EntityRelEntity().setSource(e._2()))
.map(GraphMappingUtils::serialize)
.saveAsTextFile(entitiesPath, GzipCodec.class);
JavaPairRDD<String, EntityRelEntity> entities = sc.textFile(entitiesPath)
.map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class))
.mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t));
// reads the relationships
final JavaPairRDD<String, EntityRelEntity> relation = readPathRelation(sc, getInputPath())
.filter(r -> !r.getDeleted()) //only consider those that are not virtually deleted
.map(p -> new EntityRelEntity().setRelation(p))
.mapToPair(p -> new Tuple2<>(p.getRelation().getSourceId(), p))
.groupByKey()
.map(p -> Iterables.limit(p._2(), MAX_RELS))
.flatMap(p -> p.iterator())
.mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p));
//final String bySource = getOutPath() + "/1_join_by_target";
JavaPairRDD<String, EntityRelEntity> bySource = relation
.join(entities
.filter(e -> !e._2().getSource().getDeleted())
.mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2()))))
.map(s -> new EntityRelEntity()
.setRelation(s._2()._1().getRelation())
.setTarget(s._2()._2().getSource()))
.mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t));
final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, false, schemaLocation, new HashSet<>());
entities
.union(bySource)
.groupByKey() // by source id
.map(l -> toJoinedEntity(l))
.mapToPair(je -> new Tuple2<>(
new Text(je.getEntity().getId()),
new Text(recordFactory.build(je))))
.saveAsHadoopFile(getOutPath() + "/xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
return this;
}
public GraphJoiner asXML() {
final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext());
final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, true, "", new HashSet<>());
final ObjectMapper mapper = new ObjectMapper();
final String joinedEntitiesPath = getOutPath() + "/1_joined_entities";
sc.textFile(joinedEntitiesPath)
.map(s -> mapper.readValue(s, JoinedEntity.class))
.mapToPair(je -> new Tuple2<>(new Text(je.getEntity().getId()), new Text(recordFactory.build(je))))
.saveAsHadoopFile(getOutPath() + "/2_xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
return this;
}
public SparkSession getSpark() {
return spark;
}
public String getInputPath() {
return inputPath;
}
public String getOutPath() {
return outPath;
}
// HELPERS
private OafEntity parseOaf(final String json, final String type) {
final ObjectMapper o = new ObjectMapper();
try {
switch (GraphMappingUtils.EntityType.valueOf(type)) {
case publication:
return o.readValue(json, Publication.class);
case dataset:
return o.readValue(json, Dataset.class);
case otherresearchproduct:
return o.readValue(json, OtherResearchProduct.class);
case software:
return o.readValue(json, Software.class);
case datasource:
return o.readValue(json, Datasource.class);
case organization:
return o.readValue(json, Organization.class);
case project:
return o.readValue(json, Project.class);
default:
throw new IllegalArgumentException("invalid type: " + type);
}
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
private JoinedEntity toJoinedEntity(Tuple2<String, Iterable<EntityRelEntity>> p) {
final ObjectMapper o = new ObjectMapper();
final JoinedEntity j = new JoinedEntity();
final Links links2 = new Links();
for(EntityRelEntity rel : p._2()) {
if (rel.hasMainEntity() & j.getEntity() == null) {
j.setType(rel.getSource().getType());
j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType()));
}
if (rel.hasRelatedEntity()) {
try {
links2.add(
new eu.dnetlib.dhp.graph.model.Tuple2()
.setRelation(o.readValue(rel.getRelation().getOaf(), Relation.class))
.setRelatedEntity(o.readValue(rel.getTarget().getOaf(), RelatedEntity.class)));
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
}
j.setLinks(links2);
if (j.getEntity() == null) {
throw new IllegalStateException("missing main entity on '" + p._1() + "'");
}
return j;
}
/**
* Reads a set of eu.dnetlib.dhp.schema.oaf.OafEntity objects from a sequence file <className, entity json serialization>,
* extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow
* @param sc
* @param inputPath
* @param type
* @return the JavaPairRDD<String, TypedRow> indexed by entity identifier
*/
private JavaPairRDD<String, TypedRow> readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) {
return sc.sequenceFile(inputPath + "/" + type, Text.class, Text.class)
.mapToPair((PairFunction<Tuple2<Text, Text>, String, TypedRow>) item -> {
final String s = item._2().toString();
final DocumentContext json = JsonPath.parse(s);
final String id = json.read("$.id");
return new Tuple2<>(id, new TypedRow()
.setSourceId(id)
.setDeleted(json.read("$.dataInfo.deletedbyinference"))
.setType(type)
.setOaf(s));
});
}
/**
* Reads a set of eu.dnetlib.dhp.schema.oaf.Relation objects from a sequence file <className, relation json serialization>,
* extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow
* @param sc
* @param inputPath
* @return the JavaRDD<TypedRow> containing all the relationships
*/
private JavaRDD<TypedRow> readPathRelation(final JavaSparkContext sc, final String inputPath) {
return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class)
.map(item -> {
final String s = item._2().toString();
final DocumentContext json = JsonPath.parse(s);
return new TypedRow()
.setSourceId(json.read("$.source"))
.setTargetId(json.read("$.target"))
.setDeleted(json.read("$.dataInfo.deletedbyinference"))
.setType("relation")
.setOaf(s);
});
}
}
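
A minimal sketch of the (((T join R) union S) groupby S.id) pattern described in the class comment above, using plain strings in place of TypedRow/EntityRelEntity and a local SparkContext; the JoinPatternSketch class and the sample ids are illustrative only and not part of this commit.

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

// Illustrative only: the join pattern used by GraphJoiner, on plain strings.
public class JoinPatternSketch {
    public static void main(final String[] args) {
        final JavaSparkContext sc = new JavaSparkContext("local[*]", "join-sketch");

        // S: entities indexed by id (id -> payload)
        final JavaPairRDD<String, String> entities = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("p1", "publication 1"),
                new Tuple2<>("d1", "datasource 1")));

        // R: relations keyed by target id (targetId -> sourceId)
        final JavaPairRDD<String, String> relsByTarget = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("d1", "p1")));

        // T join R: attach the target entity payload to each relation, then re-key by source id
        final JavaPairRDD<String, String> bySource = relsByTarget
                .join(entities)
                .mapToPair(t -> new Tuple2<>(t._2()._1(), "rel -> " + t._2()._2()));

        // (S union (T join R)) grouped by S.id: each source id with its payload and its links
        entities.union(bySource)
                .groupByKey()
                .foreach(g -> System.out.println(g._1() + " = " + g._2()));

        sc.stop();
    }
}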

View File

@ -0,0 +1,188 @@
package eu.dnetlib.dhp.graph;
import com.lucidworks.spark.util.SolrSupport;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.graph.utils.ISLookupClientFactory;
import eu.dnetlib.dhp.graph.utils.StreamingInputDocumentFactory;
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.solr.common.SolrInputDocument;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.SparkSession;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.text.SimpleDateFormat;
import java.util.Date;
public class SparkXmlIndexingJob {
private static final Log log = LogFactory.getLog(SparkXmlIndexingJob.class);
private static final Integer DEFAULT_BATCH_SIZE = 1000;
private static final String LAYOUT = "index";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlIndexingJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_params_update_index.json")));
parser.parseArgument(args);
final String inputPath = parser.get("sourcePath");
final String isLookupUrl = parser.get("isLookupUrl");
final String format = parser.get("format");
final Integer batchSize = parser.getObjectMap().containsKey("batchSize") ? Integer.valueOf(parser.get("batchSize")) : DEFAULT_BATCH_SIZE;
final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
final String fields = getLayoutSource(isLookup, format);
final String xslt = getLayoutTransformer(isLookup);
final String dsId = getDsId(format, isLookup);
final String zkHost = getZkHost(isLookup);
final String version = getRecordDatestamp();
final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
log.info("indexRecordTransformer: " + indexRecordXslt);
final String master = parser.get("master");
final SparkConf conf = new SparkConf()
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
try(SparkSession spark = getSession(conf, master)) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
RDD<SolrInputDocument> docs = sc.sequenceFile(inputPath, Text.class, Text.class)
.map(t -> t._2().toString())
.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s))
.rdd();
SolrSupport.indexDocs(zkHost, format + "-" + LAYOUT + "-openaire", batchSize, docs);
}
}
private static SparkSession getSession(SparkConf conf, String master) {
return SparkSession
.builder()
.config(conf)
.appName(SparkXmlRecordBuilderJob.class.getSimpleName())
.master(master)
.getOrCreate();
}
private static String toIndexRecord(Transformer tr, final String record) {
final StreamResult res = new StreamResult(new StringWriter());
try {
tr.transform(new StreamSource(new StringReader(record)), res);
return res.getWriter().toString();
} catch (Throwable e) {
System.out.println("XPathException on record:\n" + record);
throw new IllegalArgumentException(e);
}
}
/**
* Creates the XSLT responsible for building the index xml records.
*
* @param format Metadata format name (DMF|TMF)
* @param fields the list of fields
* @param xslt the layout-to-record stylesheet used to build the index record transformer
* @return the index record XSLT, as a string
* @throws TransformerException could happen
*/
private static String getLayoutTransformer(String format, String fields, String xslt) throws TransformerException {
final Transformer layoutTransformer = SaxonTransformerFactory.newInstance(xslt);
final StreamResult layoutToXsltXslt = new StreamResult(new StringWriter());
layoutTransformer.setParameter("format", format);
layoutTransformer.transform(new StreamSource(new StringReader(fields)), layoutToXsltXslt);
return layoutToXsltXslt.getWriter().toString();
}
/**
* Returns a Solr-compatible string representation of the current date, used to mark all records as indexed today.
* @return the formatted date
*/
public static String getRecordDatestamp() {
return new SimpleDateFormat("yyyy-MM-dd'T'hh:mm:ss'Z'").format(new Date());
}
/**
* Method retrieves from the information system the list of fields associated to the given MDFormat name
*
* @param isLookup the ISLookup service stub
* @param format the Metadata format name
* @return the string representation of the list of fields to be indexed
*
* @throws ISLookUpDocumentNotFoundException
* @throws ISLookUpException
*/
private static String getLayoutSource(final ISLookUpService isLookup, final String format) throws ISLookUpDocumentNotFoundException, ISLookUpException {
return doLookup(isLookup, String.format(
"collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']", format, LAYOUT));
}
/**
* Method retrieves from the information system the openaireLayoutToRecordStylesheet
*
* @param isLookup the ISLookup service stub
* @return the string representation of the XSLT contained in the transformation rule profile
*
* @throws ISLookUpDocumentNotFoundException
* @throws ISLookUpException
*/
private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException {
return doLookup(isLookup, "collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')" +
"//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()");
}
/**
* Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name
* @param format
* @param isLookup
* @return the IndexDS identifier
* @throws ISLookUpException
*/
private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException {
return doLookup(isLookup, String.format("collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')" +
"//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()", format));
}
/**
* Method retrieves from the information system the zookeeper quorum of the Solr server
* @param isLookup
* @return the zookeeper quorum of the Solr server
* @throws ISLookUpException
*/
private static String getZkHost(ISLookUpService isLookup) throws ISLookUpException {
return doLookup(isLookup, "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()");
}
private static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException {
log.info(String.format("running xquery: %s", xquery));
final String res = isLookup.getResourceProfileByQuery(xquery);
log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ..."));
return res;
}
}
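
getRecordDatestamp above stamps every indexed record with the same version string; the pattern uses lowercase hh (hour of am/pm, 01-12) rather than HH (hour of day, 00-23). A minimal, JDK-only sketch comparing the two forms; the DatestampDemo class is illustrative only and not part of this commit.

import java.text.SimpleDateFormat;
import java.util.Date;

// Illustrative only: compares the pattern used above with its 24-hour variant.
public class DatestampDemo {
    public static void main(final String[] args) {
        final Date now = new Date();
        // same pattern as getRecordDatestamp(): 'hh' is the 12-hour clock
        System.out.println(new SimpleDateFormat("yyyy-MM-dd'T'hh:mm:ss'Z'").format(now));
        // 24-hour variant, for comparison
        System.out.println(new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'").format(now));
    }
}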

View File

@ -0,0 +1,48 @@
package eu.dnetlib.dhp.graph;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.graph.utils.ContextMapper;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
public class SparkXmlRecordBuilderJob {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlRecordBuilderJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json")));
parser.parseArgument(args);
final String master = parser.get("master");
final SparkConf conf = new SparkConf()
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
try(SparkSession spark = getSession(conf, master)) {
final String inputPath = parser.get("sourcePath");
final String outputPath = parser.get("outputPath");
final String isLookupUrl = parser.get("isLookupUrl");
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
if (fs.exists(new Path(outputPath))) {
fs.delete(new Path(outputPath), true);
fs.mkdirs(new Path(outputPath));
}
new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), inputPath, outputPath)
.adjacencyLists();
}
}
private static SparkSession getSession(SparkConf conf, String master) {
return SparkSession
.builder()
.config(conf)
.appName(SparkXmlRecordBuilderJob.class.getSimpleName())
.master(master)
.getOrCreate();
}
}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.dhp.graph.model;
import java.io.Serializable;
public class EntityRelEntity implements Serializable {
private TypedRow source;
private TypedRow relation;
private TypedRow target;
public EntityRelEntity() {
}
public EntityRelEntity(TypedRow source) {
this.source = source;
}
//helpers
public Boolean hasMainEntity() {
return getSource() != null & getRelation() == null & getTarget() == null;
}
public Boolean hasRelatedEntity() {
return getSource() == null & getRelation() != null & getTarget() != null;
}
public TypedRow getSource() {
return source;
}
public EntityRelEntity setSource(TypedRow source) {
this.source = source;
return this;
}
public TypedRow getRelation() {
return relation;
}
public EntityRelEntity setRelation(TypedRow relation) {
this.relation = relation;
return this;
}
public TypedRow getTarget() {
return target;
}
public EntityRelEntity setTarget(TypedRow target) {
this.target = target;
return this;
}
}

View File

@ -0,0 +1,41 @@
package eu.dnetlib.dhp.graph.model;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import java.io.Serializable;
public class JoinedEntity implements Serializable {
private String type;
private OafEntity entity;
private Links links;
public String getType() {
return type;
}
public JoinedEntity setType(String type) {
this.type = type;
return this;
}
public OafEntity getEntity() {
return entity;
}
public JoinedEntity setEntity(OafEntity entity) {
this.entity = entity;
return this;
}
public Links getLinks() {
return links;
}
public JoinedEntity setLinks(Links links) {
this.links = links;
return this;
}
}

View File

@ -0,0 +1,6 @@
package eu.dnetlib.dhp.graph.model;
import java.util.ArrayList;
public class Links extends ArrayList<Tuple2> {
}

View File

@ -0,0 +1,257 @@
package eu.dnetlib.dhp.graph.model;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
public class RelatedEntity implements Serializable {
private String id;
private String type;
// common fields
private StructuredProperty title;
private String websiteurl; // datasource, organizations, projects
// results
private String dateofacceptance;
private String publisher;
private List<StructuredProperty> pid;
private String codeRepositoryUrl;
private Qualifier resulttype;
private List<KeyValue> collectedfrom;
private List<Instance> instances;
// datasource
private String officialname;
private Qualifier datasourcetype;
private Qualifier datasourcetypeui;
private Qualifier openairecompatibility;
//private String aggregatortype;
// organization
private String legalname;
private String legalshortname;
private Qualifier country;
// project
private String projectTitle;
private String code;
private String acronym;
private Qualifier contracttype;
private List<String> fundingtree;
public String getId() {
return id;
}
public RelatedEntity setId(String id) {
this.id = id;
return this;
}
public StructuredProperty getTitle() {
return title;
}
public RelatedEntity setTitle(StructuredProperty title) {
this.title = title;
return this;
}
public String getDateofacceptance() {
return dateofacceptance;
}
public RelatedEntity setDateofacceptance(String dateofacceptance) {
this.dateofacceptance = dateofacceptance;
return this;
}
public String getPublisher() {
return publisher;
}
public RelatedEntity setPublisher(String publisher) {
this.publisher = publisher;
return this;
}
public List<StructuredProperty> getPid() {
return pid;
}
public RelatedEntity setPid(List<StructuredProperty> pid) {
this.pid = pid;
return this;
}
public String getCodeRepositoryUrl() {
return codeRepositoryUrl;
}
public RelatedEntity setCodeRepositoryUrl(String codeRepositoryUrl) {
this.codeRepositoryUrl = codeRepositoryUrl;
return this;
}
public Qualifier getResulttype() {
return resulttype;
}
public RelatedEntity setResulttype(Qualifier resulttype) {
this.resulttype = resulttype;
return this;
}
public List<KeyValue> getCollectedfrom() {
return collectedfrom;
}
public RelatedEntity setCollectedfrom(List<KeyValue> collectedfrom) {
this.collectedfrom = collectedfrom;
return this;
}
public List<Instance> getInstances() {
return instances;
}
public RelatedEntity setInstances(List<Instance> instances) {
this.instances = instances;
return this;
}
public String getOfficialname() {
return officialname;
}
public RelatedEntity setOfficialname(String officialname) {
this.officialname = officialname;
return this;
}
public String getWebsiteurl() {
return websiteurl;
}
public RelatedEntity setWebsiteurl(String websiteurl) {
this.websiteurl = websiteurl;
return this;
}
public Qualifier getDatasourcetype() {
return datasourcetype;
}
public RelatedEntity setDatasourcetype(Qualifier datasourcetype) {
this.datasourcetype = datasourcetype;
return this;
}
public Qualifier getDatasourcetypeui() {
return datasourcetypeui;
}
public RelatedEntity setDatasourcetypeui(Qualifier datasourcetypeui) {
this.datasourcetypeui = datasourcetypeui;
return this;
}
public Qualifier getOpenairecompatibility() {
return openairecompatibility;
}
public RelatedEntity setOpenairecompatibility(Qualifier openairecompatibility) {
this.openairecompatibility = openairecompatibility;
return this;
}
public String getLegalname() {
return legalname;
}
public RelatedEntity setLegalname(String legalname) {
this.legalname = legalname;
return this;
}
public String getLegalshortname() {
return legalshortname;
}
public RelatedEntity setLegalshortname(String legalshortname) {
this.legalshortname = legalshortname;
return this;
}
public Qualifier getCountry() {
return country;
}
public RelatedEntity setCountry(Qualifier country) {
this.country = country;
return this;
}
public String getCode() {
return code;
}
public RelatedEntity setCode(String code) {
this.code = code;
return this;
}
public String getAcronym() {
return acronym;
}
public RelatedEntity setAcronym(String acronym) {
this.acronym = acronym;
return this;
}
public Qualifier getContracttype() {
return contracttype;
}
public RelatedEntity setContracttype(Qualifier contracttype) {
this.contracttype = contracttype;
return this;
}
public List<String> getFundingtree() {
return fundingtree;
}
public RelatedEntity setFundingtree(List<String> fundingtree) {
this.fundingtree = fundingtree;
return this;
}
public String getProjectTitle() {
return projectTitle;
}
public RelatedEntity setProjectTitle(String projectTitle) {
this.projectTitle = projectTitle;
return this;
}
public String getType() {
return type;
}
public RelatedEntity setType(String type) {
this.type = type;
return this;
}
}

View File

@ -0,0 +1,28 @@
package eu.dnetlib.dhp.graph.model;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class Tuple2 {
private Relation relation;
private RelatedEntity relatedEntity;
public Relation getRelation() {
return relation;
}
public Tuple2 setRelation(Relation relation) {
this.relation = relation;
return this;
}
public RelatedEntity getRelatedEntity() {
return relatedEntity;
}
public Tuple2 setRelatedEntity(RelatedEntity relatedEntity) {
this.relatedEntity = relatedEntity;
return this;
}
}

View File

@ -0,0 +1,61 @@
package eu.dnetlib.dhp.graph.model;
import java.io.Serializable;
public class TypedRow implements Serializable {
private String sourceId;
private String targetId;
private Boolean deleted;
private String type;
private String oaf;
public String getSourceId() {
return sourceId;
}
public TypedRow setSourceId(String sourceId) {
this.sourceId = sourceId;
return this;
}
public String getTargetId() {
return targetId;
}
public TypedRow setTargetId(String targetId) {
this.targetId = targetId;
return this;
}
public Boolean getDeleted() {
return deleted;
}
public TypedRow setDeleted(Boolean deleted) {
this.deleted = deleted;
return this;
}
public String getType() {
return type;
}
public TypedRow setType(String type) {
this.type = type;
return this;
}
public String getOaf() {
return oaf;
}
public TypedRow setOaf(String oaf) {
this.oaf = oaf;
return this;
}
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.dhp.graph.utils;
import java.io.Serializable;
public class ContextDef implements Serializable {
private String id;
private String label;
private String name;
private String type;
public ContextDef(final String id, final String label, final String name, final String type) {
super();
this.setId(id);
this.setLabel(label);
this.setName(name);
this.setType(type);
}
public String getLabel() {
return label;
}
public void setLabel(final String label) {
this.label = label;
}
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(final String name) {
this.name = name;
}
public String getType() {
return type;
}
public void setType(final String type) {
this.type = type;
}
}

View File

@ -0,0 +1,45 @@
package eu.dnetlib.dhp.graph.utils;
import com.google.common.base.Joiner;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import java.io.Serializable;
import java.io.StringReader;
import java.util.HashMap;
public class ContextMapper extends HashMap<String, ContextDef> implements Serializable {
private static final long serialVersionUID = 2159682308502487305L;
private final static String XQUERY = "for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']//*[name()='context' or name()='category' or name()='concept'] return <entry id=\"{$x/@id}\" label=\"{$x/@label|$x/@name}\" name=\"{$x/name()}\" type=\"{$x/@type}\"/>";
public static ContextMapper fromIS(final String isLookupUrl) throws DocumentException, ISLookUpException {
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
StringBuilder sb = new StringBuilder("<ContextDSResources>");
Joiner.on("").appendTo(sb, isLookUp.quickSearchProfile(XQUERY));
sb.append("</ContextDSResources>");
return fromXml(sb.toString());
}
public static ContextMapper fromXml(final String xml) throws DocumentException {
final ContextMapper contextMapper = new ContextMapper();
final Document doc = new SAXReader().read(new StringReader(xml));
for (Object o : doc.selectNodes("//entry")) {
Node node = (Node) o;
String id = node.valueOf("./@id");
String label = node.valueOf("./@label");
String name = node.valueOf("./@name");
String type = node.valueOf("./@type") + "";
contextMapper.put(id, new ContextDef(id, label, name, type));
}
return contextMapper;
}
}
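
A minimal usage sketch of ContextMapper.fromXml with a literal profile fragment; the entry ids and labels below are made up for illustration, and the ContextMapperDemo class is not part of this commit.

import org.dom4j.DocumentException;

import eu.dnetlib.dhp.graph.utils.ContextDef;
import eu.dnetlib.dhp.graph.utils.ContextMapper;

// Illustrative only: parsing a small, hand-written profile fragment.
public class ContextMapperDemo {
    public static void main(final String[] args) throws DocumentException {
        final String xml = "<ContextDSResources>"
                + "<entry id=\"egi\" label=\"EGI Federation\" name=\"context\" type=\"community\"/>"
                + "<entry id=\"egi::projects\" label=\"EGI Projects\" name=\"category\" type=\"\"/>"
                + "</ContextDSResources>";

        final ContextMapper mapper = ContextMapper.fromXml(xml);
        final ContextDef def = mapper.get("egi");
        System.out.println(def.getLabel() + " / " + def.getName() + " / " + def.getType());
    }
}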

View File

@ -0,0 +1,254 @@
package eu.dnetlib.dhp.graph.utils;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Predicate;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.graph.model.EntityRelEntity;
import eu.dnetlib.dhp.graph.model.RelatedEntity;
import eu.dnetlib.dhp.graph.model.TypedRow;
import eu.dnetlib.dhp.schema.oaf.*;
import net.minidev.json.JSONArray;
import org.apache.commons.lang3.StringUtils;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import static org.apache.commons.lang3.StringUtils.*;
public class GraphMappingUtils {
public enum EntityType {
publication, dataset, otherresearchproduct, software, datasource, organization, project
}
public enum MainEntityType {
result, datasource, organization, project
}
public static Set<String> authorPidTypes = Sets.newHashSet("orcid", "magidentifier");
public static Set<String> instanceFieldFilter = Sets.newHashSet("instancetype", "hostedby", "license", "accessright", "collectedfrom", "dateofacceptance", "distributionlocation");
private static BiMap<String, String> relClassMapping = HashBiMap.create();
static {
relClassMapping.put("isAuthorInstitutionOf", "hasAuthorInstitution");
relClassMapping.put("isMergedIn", "merges");
relClassMapping.put("isProducedBy", "produces");
relClassMapping.put("hasParticipant", "isParticipant");
relClassMapping.put("isProvidedBy", "provides");
relClassMapping.put("isRelatedTo", "isRelatedTo");
relClassMapping.put("isAmongTopNSimilarDocuments", "hasAmongTopNSimilarDocuments");
relClassMapping.put("isSupplementTo", "isSupplementedBy");
}
public static String getInverseRelClass(final String relClass) {
String res = relClassMapping.get(relClass);
if (isNotBlank(res)) {
return res;
}
res = relClassMapping.inverse().get(relClass);
if (isNotBlank(res)) {
return res;
}
throw new IllegalArgumentException("unable to find an inverse relationship class for term: " + relClass);
}
private static final String schemeTemplate = "dnet:%s_%s_relations";
private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
static {
entityMapping.put(EntityType.publication, MainEntityType.result);
entityMapping.put(EntityType.dataset, MainEntityType.result);
entityMapping.put(EntityType.otherresearchproduct, MainEntityType.result);
entityMapping.put(EntityType.software, MainEntityType.result);
entityMapping.put(EntityType.datasource, MainEntityType.datasource);
entityMapping.put(EntityType.organization, MainEntityType.organization);
entityMapping.put(EntityType.project, MainEntityType.project);
}
public static String getScheme(final String sourceType, final String targetType) {
return String.format(schemeTemplate,
entityMapping.get(EntityType.valueOf(sourceType)).name(),
entityMapping.get(EntityType.valueOf(targetType)).name());
}
public static String getMainType(final String type) {
return entityMapping.get(EntityType.valueOf(type)).name();
}
public static boolean isResult(String type) {
return MainEntityType.result.name().equals(getMainType(type));
}
public static Predicate<String> instanceFilter = s -> instanceFieldFilter.contains(s);
public static EntityRelEntity asRelatedEntity(EntityRelEntity e) {
final DocumentContext j = JsonPath.parse(e.getSource().getOaf());
final RelatedEntity re = new RelatedEntity().setId(j.read("$.id")).setType(e.getSource().getType());
switch (EntityType.valueOf(e.getSource().getType())) {
case publication:
case dataset:
case otherresearchproduct:
case software:
mapTitle(j, re);
re.setDateofacceptance(j.read("$.dateofacceptance.value"));
re.setPublisher(j.read("$.publisher.value"));
JSONArray pids = j.read("$.pid");
re.setPid(pids.stream()
.map(p -> asStructuredProperty((LinkedHashMap<String, Object>) p))
.collect(Collectors.toList()));
re.setResulttype(asQualifier(j.read("$.resulttype")));
JSONArray collfrom = j.read("$.collectedfrom");
re.setCollectedfrom(collfrom.stream()
.map(c -> asKV((LinkedHashMap<String, Object>) c))
.collect(Collectors.toList()));
// will throw exception when the instance is not found
JSONArray instances = j.read("$.instance");
re.setInstances(instances.stream()
.map(i -> {
final LinkedHashMap<String, Object> p = (LinkedHashMap<String, Object>) i;
final Field<String> license = new Field<String>();
license.setValue((String) ((LinkedHashMap<String, Object>) p.get("license")).get("value"));
final Instance instance = new Instance();
instance.setLicense(license);
instance.setAccessright(asQualifier((LinkedHashMap<String, String>) p.get("accessright")));
instance.setInstancetype(asQualifier((LinkedHashMap<String, String>) p.get("instancetype")));
instance.setHostedby(asKV((LinkedHashMap<String, Object>) p.get("hostedby")));
//TODO mapping of distributionlocation
instance.setCollectedfrom(asKV((LinkedHashMap<String, Object>) p.get("collectedfrom")));
Field<String> dateofacceptance = new Field<String>();
dateofacceptance.setValue((String) ((LinkedHashMap<String, Object>) p.get("dateofacceptance")).get("value"));
instance.setDateofacceptance(dateofacceptance);
return instance;
}).collect(Collectors.toList()));
//TODO still to be mapped
//re.setCodeRepositoryUrl(j.read("$.coderepositoryurl"));
break;
case datasource:
re.setOfficialname(j.read("$.officialname.value"));
re.setWebsiteurl(j.read("$.websiteurl.value"));
re.setDatasourcetype(asQualifier(j.read("$.datasourcetype")));
re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility")));
break;
case organization:
re.setLegalname(j.read("$.legalname.value"));
re.setLegalshortname(j.read("$.legalshortname.value"));
re.setCountry(asQualifier(j.read("$.country")));
break;
case project:
re.setProjectTitle(j.read("$.title.value"));
re.setCode(j.read("$.code.value"));
re.setAcronym(j.read("$.acronym.value"));
re.setContracttype(asQualifier(j.read("$.contracttype")));
JSONArray f = j.read("$.fundingtree");
if (!f.isEmpty()) {
re.setFundingtree(f.stream()
.map(s -> ((LinkedHashMap<String, String>) s).get("value"))
.collect(Collectors.toList()));
}
break;
}
return new EntityRelEntity().setSource(
new TypedRow()
.setSourceId(e.getSource().getSourceId())
.setDeleted(e.getSource().getDeleted())
.setType(e.getSource().getType())
.setOaf(serialize(re)));
}
private static KeyValue asKV(LinkedHashMap<String, Object> j) {
final KeyValue kv = new KeyValue();
kv.setKey((String) j.get("key"));
kv.setValue((String) j.get("value"));
return kv;
}
private static void mapTitle(DocumentContext j, RelatedEntity re) {
final JSONArray a = j.read("$.title");
if (!a.isEmpty()) {
final StructuredProperty sp = asStructuredProperty((LinkedHashMap<String, Object>) a.get(0));
if (StringUtils.isNotBlank(sp.getValue())) {
re.setTitle(sp);
}
}
}
private static StructuredProperty asStructuredProperty(LinkedHashMap<String, Object> j) {
final StructuredProperty sp = new StructuredProperty();
final String value = (String) j.get("value");
if (StringUtils.isNotBlank(value)) {
sp.setValue((String) j.get("value"));
sp.setQualifier(asQualifier((LinkedHashMap<String, String>) j.get("qualifier")));
}
return sp;
}
public static Qualifier asQualifier(LinkedHashMap<String, String> j) {
final Qualifier q = new Qualifier();
final String classid = j.get("classid");
if (StringUtils.isNotBlank(classid)) {
q.setClassid(classid);
}
final String classname = j.get("classname");
if (StringUtils.isNotBlank(classname)) {
q.setClassname(classname);
}
final String schemeid = j.get("schemeid");
if (StringUtils.isNotBlank(schemeid)) {
q.setSchemeid(schemeid);
}
final String schemename = j.get("schemename");
if (StringUtils.isNotBlank(schemename)) {
q.setSchemename(schemename);
}
return q;
}
public static String serialize(final Object o) {
try {
return new ObjectMapper()
.setSerializationInclusion(JsonInclude.Include.NON_NULL)
.writeValueAsString(o);
} catch (JsonProcessingException e) {
throw new IllegalArgumentException("unable to serialize: " + o.toString(), e);
}
}
public static String removePrefix(final String s) {
if (s.contains("|")) return substringAfter(s, "|");
return s;
}
}
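// Minimal usage sketch for the static helpers above; the identifier literal and the
// expected behaviour noted in the comments are illustrative assumptions, not values taken from this module.
class GraphMappingUtilsExample {
public static void main(String[] args) {
// "50|"-style type prefixes are stripped before exposing identifiers
final String id = GraphMappingUtils.removePrefix("50|doi_________::abc123");
// relation scheme name derived from schemeTemplate and the main types of source/target
final String scheme = GraphMappingUtils.getScheme("publication", "project");
// publication, dataset, otherresearchproduct and software all map to the 'result' main type
final boolean isResult = GraphMappingUtils.isResult("software");
System.out.println(id + " " + scheme + " " + isResult);
}
}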

View File

@ -0,0 +1,24 @@
package eu.dnetlib.dhp.graph.utils;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
public class ISLookupClientFactory {
private static final Log log = LogFactory.getLog(ISLookupClientFactory.class);
public static ISLookUpService getLookUpService(final String isLookupUrl) {
return getServiceStub(ISLookUpService.class, isLookupUrl);
}
@SuppressWarnings("unchecked")
private static <T> T getServiceStub(final Class<T> clazz, final String endpoint) {
log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint));
final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
jaxWsProxyFactory.setServiceClass(clazz);
jaxWsProxyFactory.setAddress(endpoint);
return (T) jaxWsProxyFactory.create();
}
}
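// Usage sketch: the endpoint URL is a placeholder (normally passed in as the isLookupUrl
// workflow parameter); the returned object is a plain CXF JAX-WS client stub.
class ISLookupClientFactoryExample {
public static void main(String[] args) {
final ISLookUpService lookUp = ISLookupClientFactory.getLookUpService("http://services.example.org/is/services/isLookUp");
System.out.println("created ISLookUp stub: " + lookUp);
}
}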

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.graph.utils;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import java.util.Comparator;
public class LicenseComparator implements Comparator<Qualifier> {
@Override
public int compare(Qualifier left, Qualifier right) {
if (left == null && right == null) return 0;
if (left == null) return 1;
if (right == null) return -1;
String lClass = left.getClassid();
String rClass = right.getClassid();
if (lClass.equals(rClass)) return 0;
if (lClass.equals("OPEN SOURCE")) return -1;
if (rClass.equals("OPEN SOURCE")) return 1;
if (lClass.equals("OPEN")) return -1;
if (rClass.equals("OPEN")) return 1;
if (lClass.equals("6MONTHS")) return -1;
if (rClass.equals("6MONTHS")) return 1;
if (lClass.equals("12MONTHS")) return -1;
if (rClass.equals("12MONTHS")) return 1;
if (lClass.equals("EMBARGO")) return -1;
if (rClass.equals("EMBARGO")) return 1;
if (lClass.equals("RESTRICTED")) return -1;
if (rClass.equals("RESTRICTED")) return 1;
if (lClass.equals("CLOSED")) return -1;
if (rClass.equals("CLOSED")) return 1;
if (lClass.equals("UNKNOWN")) return -1;
if (rClass.equals("UNKNOWN")) return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
}
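// Usage sketch: selecting the most permissive access right among a couple of qualifiers.
// The classid literals follow the ordering encoded above; the input list is made up.
class LicenseComparatorExample {
public static void main(String[] args) {
final Qualifier open = new Qualifier();
open.setClassid("OPEN");
final Qualifier closed = new Qualifier();
closed.setClassid("CLOSED");
final Qualifier best = java.util.Collections.min(java.util.Arrays.asList(closed, open), new LicenseComparator());
System.out.println(best.getClassid()); // prints OPEN, which sorts before CLOSED
}
}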

View File

@ -0,0 +1,253 @@
package eu.dnetlib.dhp.graph.utils;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import javax.xml.stream.*;
import javax.xml.stream.events.Namespace;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import com.google.common.collect.Lists;
import org.apache.solr.common.SolrInputDocument;
/**
* Optimized version of the document parser; drop-in replacement for InputDocumentFactory.
*
* <p>
* Faster because:
* </p>
* <ul>
* <li>Doesn't create a DOM for the full document</li>
* <li>Doesn't execute xpaths against the DOM</li>
* <li>Quickly serializes the 'result' element directly into a string.</li>
* <li>Uses less memory: less pressure on GC and allows more threads to process this in parallel</li>
* </ul>
*
* <p>
* This class is fully reentrant and can be invoked in parallel.
* </p>
*
* @author claudio
*
*/
public class StreamingInputDocumentFactory {
private static final String INDEX_FIELD_PREFIX = "__";
private static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion";
private static final String DS_ID = INDEX_FIELD_PREFIX + "dsid";
private static final String RESULT = "result";
private static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT;
private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier";
private static final String outFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
private final static List<String> dateFormats = Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy");
private static final String DEFAULTDNETRESULT = "dnetResult";
private static final String TARGETFIELDS = "targetFields";
private static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier";
private static final String ROOT_ELEMENT = "indexRecord";
private static final int MAX_FIELD_LENGTH = 25000;
private ThreadLocal<XMLInputFactory> inputFactory = ThreadLocal.withInitial(() -> XMLInputFactory.newInstance());
private ThreadLocal<XMLOutputFactory> outputFactory = ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance());
private ThreadLocal<XMLEventFactory> eventFactory = ThreadLocal.withInitial(() -> XMLEventFactory.newInstance());
private String version;
private String dsId;
private String resultName = DEFAULTDNETRESULT;
public StreamingInputDocumentFactory(final String version, final String dsId) {
this(version, dsId, DEFAULTDNETRESULT);
}
public StreamingInputDocumentFactory(final String version, final String dsId, final String resultName) {
this.version = version;
this.dsId = dsId;
this.resultName = resultName;
}
public SolrInputDocument parseDocument(final String inputDocument) {
final StringWriter results = new StringWriter();
final List<Namespace> nsList = Lists.newLinkedList();
try {
XMLEventReader parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument));
final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>());
while (parser.hasNext()) {
final XMLEvent event = parser.nextEvent();
if ((event != null) && event.isStartElement()) {
final String localName = event.asStartElement().getName().getLocalPart();
if (ROOT_ELEMENT.equals(localName)) {
nsList.addAll(getNamespaces(event));
} else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) {
final XMLEvent text = parser.nextEvent();
String recordId = getText(text);
indexDocument.addField(INDEX_RECORD_ID, recordId);
} else if (TARGETFIELDS.equals(localName)) {
parseTargetFields(indexDocument, parser);
} else if (resultName.equals(localName)) {
copyResult(indexDocument, results, parser, nsList, resultName);
}
}
}
if (version != null) {
indexDocument.addField(DS_VERSION, version);
}
if (dsId != null) {
indexDocument.addField(DS_ID, dsId);
}
if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
indexDocument.clear();
System.err.println("missing indexrecord id:\n" + inputDocument);
}
return indexDocument;
} catch (XMLStreamException e) {
return new SolrInputDocument();
}
}
private List<Namespace> getNamespaces(final XMLEvent event) {
final List<Namespace> res = Lists.newLinkedList();
@SuppressWarnings("unchecked")
Iterator<Namespace> nsIter = event.asStartElement().getNamespaces();
while (nsIter.hasNext()) {
Namespace ns = nsIter.next();
res.add(ns);
}
return res;
}
/**
* Parse the targetFields block and add fields to the solr document.
*
* @param indexDocument
* @param parser
* @throws XMLStreamException
*/
protected void parseTargetFields(final SolrInputDocument indexDocument, final XMLEventReader parser) throws XMLStreamException {
boolean hasFields = false;
while (parser.hasNext()) {
final XMLEvent targetEvent = parser.nextEvent();
if (targetEvent.isEndElement() && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) {
break;
}
if (targetEvent.isStartElement()) {
final String fieldName = targetEvent.asStartElement().getName().getLocalPart();
final XMLEvent text = parser.nextEvent();
String data = getText(text);
addField(indexDocument, fieldName, data);
hasFields = true;
}
}
if (!hasFields) {
indexDocument.clear();
}
}
/**
* Copy the /indexRecord/result element and children, preserving namespace declarations etc.
*
* @param indexDocument
* @param results
* @param parser
* @param nsList
* @throws XMLStreamException
*/
protected void copyResult(final SolrInputDocument indexDocument,
final StringWriter results,
final XMLEventReader parser,
final List<Namespace> nsList,
final String dnetResult) throws XMLStreamException {
final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results);
for (Namespace ns : nsList) {
eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI());
}
StartElement newRecord = eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator());
// new root record
writer.add(newRecord);
// copy the rest as it is
while (parser.hasNext()) {
final XMLEvent resultEvent = parser.nextEvent();
// TODO: replace with depth tracking instead of close tag tracking.
if (resultEvent.isEndElement() && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) {
writer.add(eventFactory.get().createEndElement("", null, RESULT));
break;
}
writer.add(resultEvent);
}
writer.close();
indexDocument.addField(INDEX_RESULT, results.toString());
}
/**
* Helper used to add a field to a solr doc. It avoids adding empty fields.
*
* @param indexDocument
* @param field
* @param value
*/
private final void addField(final SolrInputDocument indexDocument, final String field, final String value) {
String cleaned = value.trim();
if (!cleaned.isEmpty()) {
// log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n");
indexDocument.addField(field.toLowerCase(), cleaned);
}
}
/**
* Helper used to get the string from a text element.
*
* @param text
* @return the text content of the event, truncated to MAX_FIELD_LENGTH when longer
*/
protected final String getText(final XMLEvent text) {
if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + text.asEndElement().getName().getLocalPart());
return "";
final String data = text.asCharacters().getData();
if (data != null && data.length() > MAX_FIELD_LENGTH) {
return data.substring(0, MAX_FIELD_LENGTH);
}
return data;
}
}
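// Usage sketch: parsing one serialized index record into a SolrInputDocument. The toy payload
// below only relies on the element names declared above (indexRecord, indexRecordIdentifier,
// targetFields, dnetResult); real records are produced by the XML record builder.
class StreamingInputDocumentFactoryExample {
public static void main(String[] args) {
final String record = "<indexRecord>"
+ "<indexRecordIdentifier>rec-1</indexRecordIdentifier>"
+ "<targetFields><title>sample title</title></targetFields>"
+ "<dnetResult><title>sample title</title></dnetResult>"
+ "</indexRecord>";
final SolrInputDocument doc = new StreamingInputDocumentFactory("2020-02-17T00:00:00Z", "ds-1").parseDocument(record);
System.out.println(doc.getFieldNames());
}
}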

View File

@ -0,0 +1,107 @@
package eu.dnetlib.dhp.graph.utils;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import org.apache.commons.lang3.StringUtils;
import org.stringtemplate.v4.ST;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.removePrefix;
import static eu.dnetlib.dhp.graph.utils.XmlSerializationUtils.escapeXml;
public class TemplateFactory {
private TemplateResources resources;
private final static char DELIMITER = '$';
public TemplateFactory() {
try {
resources = new TemplateResources();
} catch (IOException e) {
throw new IllegalStateException(e);
}
}
public String buildBody(final String type, final List<String> metadata, final List<String> rels, final List<String> children, final List<String> extraInfo) {
ST body = getTemplate(resources.getEntity());
body.add("name", type);
body.add("metadata", metadata);
body.add("rels", rels);
body.add("children", children);
body.add("extrainfo", extraInfo);
return body.render();
}
public String getChild(final String name, final String id, final List<String> metadata) {
return getTemplate(resources.getChild())
.add("name", name)
.add("hasId", !(id == null))
.add("id", id != null ? escapeXml(removePrefix(id)) : "")
.add("metadata", metadata)
.render();
}
public String buildRecord(
final OafEntity entity,
final String schemaLocation,
final String body) {
return getTemplate(resources.getRecord())
.add("id", escapeXml(removePrefix(entity.getId())))
.add("dateofcollection", entity.getDateofcollection())
.add("dateoftransformation", entity.getDateoftransformation())
.add("schemaLocation", schemaLocation)
.add("it", body)
.render();
}
public String getRel(final String type,
final String objIdentifier,
final Collection<String> fields,
final String semanticclass,
final String semantischeme,
final DataInfo info) {
return getTemplate(resources.getRel())
.add("type", type)
.add("objIdentifier", escapeXml(removePrefix(objIdentifier)))
.add("class", semanticclass)
.add("scheme", semantischeme)
.add("metadata", fields)
.add("inferred", info.getInferred())
.add("trust", info.getTrust())
.add("inferenceprovenance", info.getInferenceprovenance())
.add("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "")
.render();
}
public String getInstance(final String resultId, final List<String> instancemetadata, final List<String> webresources) {
return getTemplate(resources.getInstance())
.add("instanceId", escapeXml(removePrefix(resultId)))
.add("metadata", instancemetadata)
.add("webresources", webresources
.stream()
.filter(StringUtils::isNotBlank)
.map(w -> getWebResource(w))
.collect(Collectors.toList()))
.render();
}
private String getWebResource(final String identifier) {
return getTemplate(resources.getWebresource())
.add("identifier", escapeXml(identifier))
.render();
}
// HELPERS
private ST getTemplate(final String res) {
return new ST(res, DELIMITER, DELIMITER);
}
}
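// Usage sketch: rendering a child element through the StringTemplate resources loaded above.
// The element name and the metadata snippets are invented values.
class TemplateFactoryExample {
public static void main(String[] args) {
final TemplateFactory factory = new TemplateFactory();
final String child = factory.getChild("externalreference", null,
java.util.Arrays.asList("<sitename>example</sitename>", "<url>http://example.org</url>"));
System.out.println(child);
}
}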

View File

@ -0,0 +1,54 @@
package eu.dnetlib.dhp.graph.utils;
import com.google.common.io.Resources;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
public class TemplateResources {
private String record = read("eu/dnetlib/dhp/graph/template/record.st");
private String instance = read("eu/dnetlib/dhp/graph/template/instance.st");
private String rel = read("eu/dnetlib/dhp/graph/template/rel.st");
private String webresource = read("eu/dnetlib/dhp/graph/template/webresource.st");
private String child = read("eu/dnetlib/dhp/graph/template/child.st");
private String entity = read("eu/dnetlib/dhp/graph/template/entity.st");
private static String read(final String classpathResource) throws IOException {
return Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8);
}
public TemplateResources() throws IOException {
}
public String getEntity() {
return entity;
}
public String getRecord() {
return record;
}
public String getInstance() {
return instance;
}
public String getRel() {
return rel;
}
public String getWebresource() {
return webresource;
}
public String getChild() {
return child;
}
}

View File

@ -0,0 +1,962 @@
package eu.dnetlib.dhp.graph.utils;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.mycila.xmltool.XMLDoc;
import com.mycila.xmltool.XMLTag;
import eu.dnetlib.dhp.graph.model.JoinedEntity;
import eu.dnetlib.dhp.graph.model.RelatedEntity;
import eu.dnetlib.dhp.graph.model.Tuple2;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.*;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.SAXReader;
import org.dom4j.io.XMLWriter;
import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.IOException;
import java.io.Serializable;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.*;
import static eu.dnetlib.dhp.graph.utils.XmlSerializationUtils.*;
import static org.apache.commons.lang3.StringUtils.isNotBlank;
import static org.apache.commons.lang3.StringUtils.substringBefore;
public class XmlRecordFactory implements Serializable {
private Set<String> specialDatasourceTypes;
private ContextMapper contextMapper;
private String schemaLocation;
private Set<String> contextes = Sets.newHashSet();
private boolean indent = false;
public XmlRecordFactory(
final ContextMapper contextMapper, final boolean indent,
final String schemaLocation, final Set<String> otherDatasourceTypesUForUI) {
this.contextMapper = contextMapper;
this.schemaLocation = schemaLocation;
this.specialDatasourceTypes = otherDatasourceTypesUForUI;
this.indent = indent;
}
public String build(final JoinedEntity je) {
final OafEntity entity = je.getEntity();
TemplateFactory templateFactory = new TemplateFactory();
try {
final List<String> metadata = metadata(je.getType(), entity);
// rels has to be processed before the contexts because they enrich the contextMap with the funding info.
final List<String> relations = listRelations(je, templateFactory);
metadata.addAll(buildContexts(getMainType(je.getType())));
metadata.add(parseDataInfo(entity.getDataInfo()));
final String body = templateFactory.buildBody(
getMainType(je.getType()),
metadata,
relations,
listChildren(je, templateFactory), listExtraInfo(je));
return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent);
} catch (final Throwable e) {
throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e);
}
}
private String printXML(String xml, boolean indent) {
try {
final Document doc = new SAXReader().read(new StringReader(xml));
OutputFormat format = indent ? OutputFormat.createPrettyPrint() : OutputFormat.createCompactFormat();
format.setExpandEmptyElements(false);
format.setSuppressDeclaration(true);
StringWriter sw = new StringWriter();
XMLWriter writer = new XMLWriter(sw, format);
writer.write(doc);
return sw.toString();
} catch (IOException | DocumentException e) {
throw new IllegalArgumentException("Unable to indent XML. Invalid record:\n" + xml, e);
}
}
private List<String> metadata(final String type, final OafEntity entity) {
final List<String> metadata = Lists.newArrayList();
if (entity.getCollectedfrom() != null) {
metadata.addAll(entity.getCollectedfrom()
.stream()
.map(kv -> mapKeyValue("collectedfrom", kv))
.collect(Collectors.toList()));
}
if (entity.getOriginalId() != null) {
metadata.addAll(entity.getOriginalId()
.stream()
.map(s -> asXmlElement("originalId", s))
.collect(Collectors.toList()));
}
if (entity.getPid() != null) {
metadata.addAll(entity.getPid()
.stream()
.map(p -> mapStructuredProperty("pid", p))
.collect(Collectors.toList()));
}
if (GraphMappingUtils.isResult(type)) {
final Result r = (Result) entity;
if (r.getTitle() != null) {
metadata.addAll(r.getTitle()
.stream()
.map(t -> mapStructuredProperty("title", t))
.collect(Collectors.toList()));
}
if (r.getBestaccessright() != null) {
metadata.add(mapQualifier("bestaccessright", r.getBestaccessright()));
}
if (r.getAuthor() != null) {
metadata.addAll(r.getAuthor()
.stream()
.map(a -> {
final StringBuilder sb = new StringBuilder("<creator rank=\"" + a.getRank() + "\"");
if (isNotBlank(a.getName())) {
sb.append(" name=\"" + escapeXml(a.getName()) + "\"");
}
if (isNotBlank(a.getSurname())) {
sb.append(" surname=\"" + escapeXml(a.getSurname()) + "\"");
}
if (a.getPid() != null) {
a.getPid().stream()
.filter(sp -> isNotBlank(sp.getQualifier().getClassid()) && isNotBlank(sp.getValue()))
.forEach(sp -> {
String pidType = escapeXml(sp.getQualifier().getClassid()).replaceAll("\\W", "");
String pidValue = escapeXml(sp.getValue());
// ugly hack: some records provide swapped pidtype and pidvalue
if (authorPidTypes.contains(pidValue.toLowerCase().trim())) {
sb.append(String.format(" %s=\"%s\"", pidValue, pidType));
} else {
pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", "");
if (isNotBlank(pidType)) {
sb.append(String.format(" %s=\"%s\"",
pidType,
pidValue.toLowerCase().replaceAll("orcid", "")));
}
}
});
}
sb.append(">" + escapeXml(a.getFullname()) + "</creator>");
return sb.toString();
}).collect(Collectors.toList()));
}
if (r.getContributor() != null) {
metadata.addAll(r.getContributor()
.stream()
.map(c -> asXmlElement("contributor", c.getValue()))
.collect(Collectors.toList()));
}
if (r.getCountry() != null) {
metadata.addAll(r.getCountry()
.stream()
.map(c -> mapQualifier("country", c))
.collect(Collectors.toList()));
}
if (r.getCoverage() != null) {
metadata.addAll(r.getCoverage()
.stream()
.map(c -> asXmlElement("coverage", c.getValue()))
.collect(Collectors.toList()));
}
if (r.getDateofacceptance() != null) {
metadata.add(asXmlElement("dateofacceptance", r.getDateofacceptance().getValue()));
}
if (r.getDescription() != null) {
metadata.addAll(r.getDescription()
.stream()
.map(c -> asXmlElement("description", c.getValue()))
.collect(Collectors.toList()));
}
if (r.getEmbargoenddate() != null) {
metadata.add(asXmlElement("embargoenddate", r.getEmbargoenddate().getValue()));
}
if (r.getSubject() != null) {
metadata.addAll(r.getSubject()
.stream()
.map(s -> mapStructuredProperty("subject", s))
.collect(Collectors.toList()));
}
if (r.getLanguage() != null) {
metadata.add(mapQualifier("language", r.getLanguage()));
}
if (r.getRelevantdate() != null) {
metadata.addAll(r.getRelevantdate()
.stream()
.map(s -> mapStructuredProperty("relevantdate", s))
.collect(Collectors.toList()));
}
if (r.getPublisher() != null) {
metadata.add(asXmlElement("publisher", r.getPublisher().getValue()));
}
if (r.getSource() != null) {
metadata.addAll(r.getSource()
.stream()
.map(c -> asXmlElement("source", c.getValue()))
.collect(Collectors.toList()));
}
if (r.getFormat() != null) {
metadata.addAll(r.getFormat()
.stream()
.map(c -> asXmlElement("format", c.getValue()))
.collect(Collectors.toList()));
}
if (r.getResulttype() != null) {
metadata.add(mapQualifier("resulttype", r.getResulttype()));
}
if (r.getResourcetype() != null) {
metadata.add(mapQualifier("resourcetype", r.getResourcetype()));
}
metadata.add(mapQualifier("bestaccessright", getBestAccessright(r)));
if (r.getContext() != null) {
contextes.addAll(r.getContext()
.stream()
.map(c -> c.getId())
.collect(Collectors.toList()));
if (contextes.contains("dh-ch::subcommunity::2")) {
contextes.add("clarin");
}
}
}
switch (EntityType.valueOf(type)) {
case publication:
final Publication pub = (Publication) entity;
if (pub.getJournal() != null) {
final Journal j = pub.getJournal();
metadata.add(mapJournal(j));
}
break;
case dataset:
final Dataset d = (Dataset) entity;
if (d.getDevice() != null) {
metadata.add(asXmlElement("device", d.getDevice().getValue()));
}
if (d.getLastmetadataupdate() != null) {
metadata.add(asXmlElement("lastmetadataupdate", d.getLastmetadataupdate().getValue()));
}
if (d.getMetadataversionnumber() != null) {
metadata.add(asXmlElement("metadataversionnumber", d.getMetadataversionnumber().getValue()));
}
if (d.getSize() != null) {
metadata.add(asXmlElement("size", d.getSize().getValue()));
}
if (d.getStoragedate() != null) {
metadata.add(asXmlElement("storagedate", d.getStoragedate().getValue()));
}
if (d.getVersion() != null) {
metadata.add(asXmlElement("version", d.getVersion().getValue()));
}
//TODO d.getGeolocation()
break;
case otherresearchproduct:
final OtherResearchProduct orp = (OtherResearchProduct) entity;
if (orp.getContactperson() != null) {
metadata.addAll(orp.getContactperson()
.stream()
.map(c -> asXmlElement("contactperson", c.getValue()))
.collect(Collectors.toList()));
}
if (orp.getContactgroup() != null) {
metadata.addAll(orp.getContactgroup()
.stream()
.map(c -> asXmlElement("contactgroup", c.getValue()))
.collect(Collectors.toList()));
}
if (orp.getTool() != null) {
metadata.addAll(orp.getTool()
.stream()
.map(c -> asXmlElement("tool", c.getValue()))
.collect(Collectors.toList()));
}
break;
case software:
final Software s = (Software) entity;
if (s.getDocumentationUrl() != null) {
metadata.addAll(s.getDocumentationUrl()
.stream()
.map(c -> asXmlElement("documentationUrl", c.getValue()))
.collect(Collectors.toList()));
}
if (s.getLicense() != null) {
metadata.addAll(s.getLicense()
.stream()
.map(l -> mapStructuredProperty("license", l))
.collect(Collectors.toList()));
}
if (s.getCodeRepositoryUrl() != null) {
metadata.add(asXmlElement("codeRepositoryUrl", s.getCodeRepositoryUrl().getValue()));
}
if (s.getProgrammingLanguage() != null) {
metadata.add(mapQualifier("programmingLanguage", s.getProgrammingLanguage()));
}
break;
case datasource:
final Datasource ds = (Datasource) entity;
if (ds.getDatasourcetype() != null) {
mapDatasourceType(metadata, ds.getDatasourcetype());
}
if (ds.getOpenairecompatibility() != null) {
metadata.add(mapQualifier("openairecompatibility", ds.getOpenairecompatibility()));
}
if (ds.getOfficialname() != null) {
metadata.add(asXmlElement("officialname", ds.getOfficialname().getValue()));
}
if (ds.getEnglishname() != null) {
metadata.add(asXmlElement("englishname", ds.getEnglishname().getValue()));
}
if (ds.getWebsiteurl() != null) {
metadata.add(asXmlElement("websiteurl", ds.getWebsiteurl().getValue()));
}
if (ds.getLogourl() != null) {
metadata.add(asXmlElement("logourl", ds.getLogourl().getValue()));
}
if (ds.getContactemail() != null) {
metadata.add(asXmlElement("contactemail", ds.getContactemail().getValue()));
}
if (ds.getNamespaceprefix() != null) {
metadata.add(asXmlElement("namespaceprefix", ds.getNamespaceprefix().getValue()));
}
if (ds.getLatitude() != null) {
metadata.add(asXmlElement("latitude", ds.getLatitude().getValue()));
}
if (ds.getLongitude() != null) {
metadata.add(asXmlElement("longitude", ds.getLongitude().getValue()));
}
if (ds.getDateofvalidation() != null) {
metadata.add(asXmlElement("dateofvalidation", ds.getDateofvalidation().getValue()));
}
if (ds.getDescription() != null) {
metadata.add(asXmlElement("description", ds.getDescription().getValue()));
}
if (ds.getOdnumberofitems() != null) {
metadata.add(asXmlElement("odnumberofitems", ds.getOdnumberofitems().getValue()));
}
if (ds.getOdnumberofitemsdate() != null) {
metadata.add(asXmlElement("odnumberofitemsdate", ds.getOdnumberofitemsdate().getValue()));
}
if (ds.getOdpolicies() != null) {
metadata.add(asXmlElement("odpolicies", ds.getOdpolicies().getValue()));
}
if (ds.getOdlanguages() != null) {
metadata.addAll(ds.getOdlanguages()
.stream()
.map(c -> asXmlElement("odlanguages", c.getValue()))
.collect(Collectors.toList()));
}
if (ds.getOdcontenttypes() != null) {
metadata.addAll(ds.getOdcontenttypes()
.stream()
.map(c -> asXmlElement("odcontenttypes", c.getValue()))
.collect(Collectors.toList()));
}
if (ds.getAccessinfopackage() != null) {
metadata.addAll(ds.getAccessinfopackage()
.stream()
.map(c -> asXmlElement("accessinfopackage", c.getValue()))
.collect(Collectors.toList()));
}
if (ds.getReleasestartdate() != null) {
metadata.add(asXmlElement("releasestartdate", ds.getReleasestartdate().getValue()));
}
if (ds.getReleaseenddate() != null) {
metadata.add(asXmlElement("releaseenddate", ds.getReleaseenddate().getValue()));
}
if (ds.getMissionstatementurl() != null) {
metadata.add(asXmlElement("missionstatementurl", ds.getMissionstatementurl().getValue()));
}
if (ds.getDataprovider() != null) {
metadata.add(asXmlElement("dataprovider", ds.getDataprovider().getValue().toString()));
}
if (ds.getServiceprovider() != null) {
metadata.add(asXmlElement("serviceprovider", ds.getServiceprovider().getValue().toString()));
}
if (ds.getDatabaseaccesstype() != null) {
metadata.add(asXmlElement("databaseaccesstype", ds.getDatabaseaccesstype().getValue()));
}
if (ds.getDatauploadtype() != null) {
metadata.add(asXmlElement("datauploadtype", ds.getDatauploadtype().getValue()));
}
if (ds.getDatabaseaccessrestriction() != null) {
metadata.add(asXmlElement("databaseaccessrestriction", ds.getDatabaseaccessrestriction().getValue()));
}
if (ds.getDatauploadrestriction() != null) {
metadata.add(asXmlElement("datauploadrestriction", ds.getDatauploadrestriction().getValue()));
}
if (ds.getVersioning() != null) {
metadata.add(asXmlElement("versioning", ds.getVersioning().getValue().toString()));
}
if (ds.getCitationguidelineurl() != null) {
metadata.add(asXmlElement("citationguidelineurl", ds.getCitationguidelineurl().getValue()));
}
if (ds.getQualitymanagementkind() != null) {
metadata.add(asXmlElement("qualitymanagementkind", ds.getQualitymanagementkind().getValue()));
}
if (ds.getPidsystems() != null) {
metadata.add(asXmlElement("pidsystems", ds.getPidsystems().getValue()));
}
if (ds.getCertificates() != null) {
metadata.add(asXmlElement("certificates", ds.getCertificates().getValue()));
}
if (ds.getPolicies() != null) {
metadata.addAll(ds.getPolicies()
.stream()
.map(kv -> mapKeyValue("policies", kv))
.collect(Collectors.toList()));
}
if (ds.getJournal() != null) {
metadata.add(mapJournal(ds.getJournal()));
}
if (ds.getSubjects() != null) {
metadata.addAll(ds.getSubjects()
.stream()
.map(sp -> mapStructuredProperty("subject", sp))
.collect(Collectors.toList()));
}
break;
case organization:
final Organization o = (Organization) entity;
if (o.getLegalshortname() != null) {
metadata.add(asXmlElement("legalshortname", o.getLegalshortname().getValue()));
}
if (o.getLegalname() != null) {
metadata.add(asXmlElement("legalname", o.getLegalname().getValue()));
}
if (o.getAlternativeNames() != null) {
metadata.addAll(o.getAlternativeNames()
.stream()
.map(c -> asXmlElement("alternativeNames", c.getValue()))
.collect(Collectors.toList()));
}
if (o.getWebsiteurl() != null) {
metadata.add(asXmlElement("websiteurl", o.getWebsiteurl().getValue()));
}
if (o.getLogourl() != null) {
metadata.add(asXmlElement("websiteurl", o.getLogourl().getValue()));
}
if (o.getEclegalbody() != null) {
metadata.add(asXmlElement("eclegalbody", o.getEclegalbody().getValue()));
}
if (o.getEclegalperson() != null) {
metadata.add(asXmlElement("eclegalperson", o.getEclegalperson().getValue()));
}
if (o.getEcnonprofit() != null) {
metadata.add(asXmlElement("ecnonprofit", o.getEcnonprofit().getValue()));
}
if (o.getEcresearchorganization() != null) {
metadata.add(asXmlElement("ecresearchorganization", o.getEcresearchorganization().getValue()));
}
if (o.getEchighereducation() != null) {
metadata.add(asXmlElement("echighereducation", o.getEchighereducation().getValue()));
}
if (o.getEcinternationalorganizationeurinterests() != null) {
metadata.add(asXmlElement("ecinternationalorganizationeurinterests", o.getEcinternationalorganizationeurinterests().getValue()));
}
if (o.getEcinternationalorganization() != null) {
metadata.add(asXmlElement("ecinternationalorganization", o.getEcinternationalorganization().getValue()));
}
if (o.getEcenterprise() != null) {
metadata.add(asXmlElement("ecenterprise", o.getEcenterprise().getValue()));
}
if (o.getEcsmevalidated() != null) {
metadata.add(asXmlElement("ecsmevalidated", o.getEcsmevalidated().getValue()));
}
if (o.getEcnutscode() != null) {
metadata.add(asXmlElement("ecnutscode", o.getEcnutscode().getValue()));
}
if (o.getCountry() != null) {
metadata.add(mapQualifier("country", o.getCountry()));
}
break;
case project:
final Project p = (Project) entity;
if (p.getWebsiteurl() != null) {
metadata.add(asXmlElement("websiteurl", p.getWebsiteurl().getValue()));
}
if (p.getCode() != null) {
metadata.add(asXmlElement("code", p.getCode().getValue()));
}
if (p.getAcronym() != null) {
metadata.add(asXmlElement("acronym", p.getAcronym().getValue()));
}
if (p.getTitle() != null) {
metadata.add(asXmlElement("title", p.getTitle().getValue()));
}
if (p.getStartdate() != null) {
metadata.add(asXmlElement("startdate", p.getStartdate().getValue()));
}
if (p.getEnddate() != null) {
metadata.add(asXmlElement("enddate", p.getEnddate().getValue()));
}
if (p.getCallidentifier() != null) {
metadata.add(asXmlElement("callidentifier", p.getCallidentifier().getValue()));
}
if (p.getKeywords() != null) {
metadata.add(asXmlElement("keywords", p.getKeywords().getValue()));
}
if (p.getDuration() != null) {
metadata.add(asXmlElement("duration", p.getDuration().getValue()));
}
if (p.getEcarticle29_3() != null) {
metadata.add(asXmlElement("ecarticle29_3", p.getEcarticle29_3().getValue()));
}
if (p.getSubjects() != null) {
metadata.addAll(p.getSubjects()
.stream()
.map(sp -> mapStructuredProperty("subject", sp))
.collect(Collectors.toList()));
}
if (p.getContracttype() != null) {
metadata.add(mapQualifier("contracttype", p.getContracttype()));
}
if (p.getEcsc39() != null) {
metadata.add(asXmlElement("ecsc39", p.getEcsc39().getValue()));
}
if (p.getContactfullname() != null) {
metadata.add(asXmlElement("contactfullname", p.getContactfullname().getValue()));
}
if (p.getContactfax() != null) {
metadata.add(asXmlElement("contactfax", p.getContactfax().getValue()));
}
if (p.getContactphone() != null) {
metadata.add(asXmlElement("contactphone", p.getContactphone().getValue()));
}
if (p.getContactemail() != null) {
metadata.add(asXmlElement("contactemail", p.getContactemail().getValue()));
}
if (p.getSummary() != null) {
metadata.add(asXmlElement("summary", p.getSummary().getValue()));
}
if (p.getCurrency() != null) {
metadata.add(asXmlElement("currency", p.getCurrency().getValue()));
}
if (p.getTotalcost() != null) {
metadata.add(asXmlElement("totalcost", p.getTotalcost().toString()));
}
if (p.getFundedamount() != null) {
metadata.add(asXmlElement("fundedamount", p.getFundedamount().toString()));
}
if (p.getFundingtree() != null) {
metadata.addAll(p.getFundingtree()
.stream()
.map(ft -> asXmlElement("fundingtree", ft.getValue()))
.collect(Collectors.toList()));
}
break;
default:
throw new IllegalArgumentException("invalid entity type: " + type);
}
return metadata;
}
private void mapDatasourceType(List<String> metadata, final Qualifier dsType) {
metadata.add(mapQualifier("datasourcetype", dsType));
if (specialDatasourceTypes.contains(dsType.getClassid())) {
dsType.setClassid("other");
dsType.setClassname("other");
}
metadata.add(mapQualifier("datasourcetypeui", dsType));
}
private Qualifier getBestAccessright(final Result r) {
Qualifier bestAccessRight = new Qualifier();
bestAccessRight.setClassid("UNKNOWN");
bestAccessRight.setClassname("not available");
bestAccessRight.setSchemeid("dnet:access_modes");
bestAccessRight.setSchemename("dnet:access_modes");
final LicenseComparator lc = new LicenseComparator();
for (final Instance instance : r.getInstance()) {
if (lc.compare(bestAccessRight, instance.getAccessright()) > 0) {
bestAccessRight = instance.getAccessright();
}
}
return bestAccessRight;
}
private List<String> listRelations(final JoinedEntity je, TemplateFactory templateFactory) {
final List<String> rels = Lists.newArrayList();
for (final Tuple2 link : je.getLinks()) {
final Relation rel = link.getRelation();
final RelatedEntity re = link.getRelatedEntity();
final String targetType = link.getRelatedEntity().getType();
final List<String> metadata = Lists.newArrayList();
switch (EntityType.valueOf(targetType)) {
case publication:
case dataset:
case otherresearchproduct:
case software:
if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) {
metadata.add(mapStructuredProperty("title", re.getTitle()));
}
if (isNotBlank(re.getDateofacceptance())) {
metadata.add(asXmlElement("dateofacceptance", re.getDateofacceptance()));
}
if (isNotBlank(re.getPublisher())) {
metadata.add(asXmlElement("publisher", re.getPublisher()));
}
if (isNotBlank(re.getCodeRepositoryUrl())) {
metadata.add(asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl()));
}
if (re.getResulttype() != null && !re.getResulttype().isBlank()) {
metadata.add(mapQualifier("resulttype", re.getResulttype()));
}
if (re.getCollectedfrom() != null) {
metadata.addAll(re.getCollectedfrom()
.stream()
.map(kv -> mapKeyValue("collectedfrom", kv))
.collect(Collectors.toList()));
}
if (re.getPid() != null) {
metadata.addAll(re.getPid()
.stream()
.map(p -> mapStructuredProperty("pid", p))
.collect(Collectors.toList()));
}
break;
case datasource:
if (isNotBlank(re.getOfficialname())) {
metadata.add(asXmlElement("officialname", re.getOfficialname()));
}
if (re.getDatasourcetype() != null && !re.getDatasourcetype().isBlank()) {
mapDatasourceType(metadata, re.getDatasourcetype());
}
if (re.getOpenairecompatibility() != null && !re.getOpenairecompatibility().isBlank()) {
metadata.add(mapQualifier("openairecompatibility", re.getOpenairecompatibility()));
}
break;
case organization:
if (isNotBlank(re.getLegalname())) {
metadata.add(asXmlElement("legalname", re.getLegalname()));
}
if (isNotBlank(re.getLegalshortname())) {
metadata.add(asXmlElement("legalshortname", re.getLegalshortname()));
}
if (re.getCountry() != null && !re.getCountry().isBlank()) {
metadata.add(mapQualifier("country", re.getCountry()));
}
break;
case project:
if (isNotBlank(re.getProjectTitle())) {
metadata.add(asXmlElement("title", re.getProjectTitle()));
}
if (isNotBlank(re.getCode())) {
metadata.add(asXmlElement("code", re.getCode()));
}
if (isNotBlank(re.getAcronym())) {
metadata.add(asXmlElement("acronym", re.getAcronym()));
}
if (re.getContracttype() != null && !re.getContracttype().isBlank()) {
metadata.add(mapQualifier("contracttype", re.getContracttype()));
}
if (re.getFundingtree() != null) {
metadata.addAll(re.getFundingtree()
.stream()
.peek(ft -> fillContextMap(ft))
.map(ft -> getRelFundingTree(ft))
.collect(Collectors.toList()));
}
break;
default:
throw new IllegalArgumentException("invalid target type: " + targetType);
}
final DataInfo info = rel.getDataInfo();
rels.add(templateFactory.getRel(
targetType,
rel.getTarget(),
Sets.newHashSet(metadata),
getInverseRelClass(rel.getRelClass()),
getScheme(targetType, re.getType()),
info));
}
return rels;
}
private List<String> listChildren(final JoinedEntity je, TemplateFactory templateFactory) {
final List<String> children = Lists.newArrayList();
if (MainEntityType.result.toString().equals(getMainType(je.getType()))) {
final List<Instance> instances = ((Result) je.getEntity()).getInstance();
if (instances != null) {
for (final Instance instance : ((Result) je.getEntity()).getInstance()) {
final List<String> fields = Lists.newArrayList();
if (instance.getAccessright() != null && !instance.getAccessright().isBlank()) {
fields.add(mapQualifier("accessright", instance.getAccessright()));
}
if (instance.getCollectedfrom() != null) {
fields.add(mapKeyValue("collectedfrom", instance.getCollectedfrom()));
}
if (instance.getHostedby() != null) {
fields.add(mapKeyValue("hostedby", instance.getHostedby()));
}
if (instance.getDateofacceptance() != null && isNotBlank(instance.getDateofacceptance().getValue())) {
fields.add(asXmlElement("dateofacceptance", instance.getDateofacceptance().getValue()));
}
if (instance.getInstancetype() != null && !instance.getInstancetype().isBlank()) {
fields.add(mapQualifier("instancetype", instance.getInstancetype()));
}
if (isNotBlank(instance.getDistributionlocation())) {
fields.add(asXmlElement("distributionlocation", instance.getDistributionlocation()));
}
if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) {
fields.add(asXmlElement("refereed", instance.getRefereed().getValue()));
}
if (instance.getProcessingchargeamount() != null && isNotBlank(instance.getProcessingchargeamount().getValue())) {
fields.add(asXmlElement("processingchargeamount", instance.getProcessingchargeamount().getValue()));
}
if (instance.getProcessingchargecurrency() != null && isNotBlank(instance.getProcessingchargecurrency().getValue())) {
fields.add(asXmlElement("processingchargecurrency", instance.getProcessingchargecurrency().getValue()));
}
children.add(templateFactory.getInstance(instance.getHostedby().getKey(), fields, instance.getUrl()));
}
}
final List<ExternalReference> ext = ((Result) je.getEntity()).getExternalReference();
if (ext != null) {
for (final ExternalReference er : ((Result) je.getEntity()).getExternalReference()) {
final List<String> fields = Lists.newArrayList();
if (isNotBlank(er.getSitename())) {
fields.add(asXmlElement("sitename", er.getSitename()));
}
if (isNotBlank(er.getLabel())) {
fields.add(asXmlElement("label", er.getLabel()));
}
if (isNotBlank(er.getUrl())) {
fields.add(asXmlElement("url", er.getUrl()));
}
if (isNotBlank(er.getDescription())) {
fields.add(asXmlElement("description", er.getDescription()));
}
if (er.getQualifier() != null && !er.getQualifier().isBlank()) {
fields.add(mapQualifier("qualifier", er.getQualifier()));
}
if (isNotBlank(er.getRefidentifier())) {
fields.add(asXmlElement("refidentifier", er.getRefidentifier()));
}
if (isNotBlank(er.getQuery())) {
fields.add(asXmlElement("query", er.getQuery()));
}
children.add(templateFactory.getChild("externalreference", null, fields));
}
}
}
return children;
}
private List<String> listExtraInfo(JoinedEntity je) {
final List<ExtraInfo> extraInfo = je.getEntity().getExtraInfo();
return extraInfo != null ? extraInfo
.stream()
.map(e -> mapExtraInfo(e))
.collect(Collectors.toList()) : Lists.newArrayList();
}
private List<String> buildContexts(final String type) {
final List<String> res = Lists.newArrayList();
if ((contextMapper != null) && !contextMapper.isEmpty() && MainEntityType.result.toString().equals(type)) {
XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot");
for (final String context : contextes) {
String id = "";
for (final String token : Splitter.on("::").split(context)) {
id += token;
final ContextDef def = contextMapper.get(id);
if (def == null) {
continue;
// throw new IllegalStateException(String.format("cannot find context for id '%s'", id));
}
if (def.getName().equals("context")) {
final String xpath = "//context/@id='" + def.getId() + "'";
if (!document.gotoRoot().rawXpathBoolean(xpath, new Object())) {
document = addContextDef(document.gotoRoot(), def);
}
}
if (def.getName().equals("category")) {
final String rootId = substringBefore(def.getId(), "::");
document = addContextDef(document.gotoRoot().gotoTag("//context[./@id='" + rootId + "']", new Object()), def);
}
if (def.getName().equals("concept")) {
document = addContextDef(document, def).gotoParent();
}
id += "::";
}
}
final Transformer transformer = getTransformer();
for (final org.w3c.dom.Element x : document.gotoRoot().getChildElement()) {
try {
res.add(asStringElement(x, transformer));
} catch (final TransformerException e) {
throw new RuntimeException(e);
}
}
}
return res;
}
private Transformer getTransformer() {
try {
Transformer transformer = TransformerFactory.newInstance().newTransformer();
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
return transformer;
} catch (TransformerConfigurationException e) {
throw new IllegalStateException("unable to create javax.xml.transform.Transformer", e);
}
}
private XMLTag addContextDef(final XMLTag tag, final ContextDef def) {
tag.addTag(def.getName()).addAttribute("id", def.getId()).addAttribute("label", def.getLabel());
if ((def.getType() != null) && !def.getType().isEmpty()) {
tag.addAttribute("type", def.getType());
}
return tag;
}
private String asStringElement(final org.w3c.dom.Element element, final Transformer transformer) throws TransformerException {
final StringWriter buffer = new StringWriter();
transformer.transform(new DOMSource(element), new StreamResult(buffer));
return buffer.toString();
}
private void fillContextMap(final String xmlTree) {
Document fundingPath;
try {
fundingPath = new SAXReader().read(new StringReader(xmlTree));
} catch (final DocumentException e) {
throw new RuntimeException(e);
}
try {
final Node funder = fundingPath.selectSingleNode("//funder");
if (funder != null) {
final String funderShortName = funder.valueOf("./shortname");
contextes.add(funderShortName);
contextMapper.put(funderShortName, new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding"));
final Node level0 = fundingPath.selectSingleNode("//funding_level_0");
if (level0 != null) {
final String level0Id = Joiner.on("::").join(funderShortName, level0.valueOf("./name"));
contextMapper.put(level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", ""));
final Node level1 = fundingPath.selectSingleNode("//funding_level_1");
if (level1 == null) {
contextes.add(level0Id);
} else {
final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name"));
contextMapper.put(level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", ""));
final Node level2 = fundingPath.selectSingleNode("//funding_level_2");
if (level2 == null) {
contextes.add(level1Id);
} else {
final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name"));
contextMapper.put(level2Id, new ContextDef(level2Id, level2.valueOf("./description"), "concept", ""));
contextes.add(level2Id);
}
}
}
}
} catch (final NullPointerException e) {
throw new IllegalArgumentException("malformed funding path: " + xmlTree, e);
}
}
@SuppressWarnings("unchecked")
private String getRelFundingTree(final String xmlTree) {
String funding = "<funding>";
try {
final Document ftree = new SAXReader().read(new StringReader(xmlTree));
funding = "<funding>";
funding += getFunderElement(ftree);
for (final Object o : Lists.reverse(ftree.selectNodes("//fundingtree//*[starts-with(local-name(),'funding_level_')]"))) {
final Element e = (Element) o;
final String _id = e.valueOf("./id");
funding += "<" + e.getName() + " name=\"" + escapeXml(e.valueOf("./name")) + "\">" + escapeXml(_id) + "</" + e.getName() + ">";
}
} catch (final DocumentException e) {
throw new IllegalArgumentException("unable to parse funding tree: " + xmlTree + "\n" + e.getMessage());
} finally {
funding += "</funding>";
}
return funding;
}
private String getFunderElement(final Document ftree) {
final String funderId = ftree.valueOf("//fundingtree/funder/id/text()");
final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname/text()");
final String funderName = ftree.valueOf("//fundingtree/funder/name/text()");
final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction/text()");
return "<funder id=\"" + escapeXml(funderId) + "\" shortname=\"" + escapeXml(funderShortName) + "\" name=\"" + escapeXml(funderName)
+ "\" jurisdiction=\"" + escapeXml(funderJurisdiction) + "\" />";
}
}
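// Usage sketch: serializing one adjacency-list entry to XML. The JoinedEntity is assumed to
// come from the join performed elsewhere in this module; the schema location and the set of
// datasource types rendered differently for the UI are placeholder values.
class XmlRecordFactoryExample {
static String toXml(final ContextMapper contextMapper, final JoinedEntity joinedEntity) {
final XmlRecordFactory recordFactory = new XmlRecordFactory(
contextMapper,
false, // compact output, no pretty printing
"http://namespace.openaire.eu/oaf https://www.openaire.eu/schema/oaf.xsd",
Sets.newHashSet("scholarcomminfra", "infospace"));
return recordFactory.build(joinedEntity);
}
}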

View File

@ -0,0 +1,151 @@
package eu.dnetlib.dhp.graph.utils;
import eu.dnetlib.dhp.schema.oaf.*;
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.removePrefix;
import static org.apache.commons.lang3.StringUtils.isBlank;
import static org.apache.commons.lang3.StringUtils.isNotBlank;
public class XmlSerializationUtils {
// XML 1.0
// #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
private final static String xml10pattern = "[^"
+ "\u0009\r\n"
+ "\u0020-\uD7FF"
+ "\uE000-\uFFFD"
+ "\ud800\udc00-\udbff\udfff"
+ "]";
public static String mapJournal(Journal j) {
final String attrs = new StringBuilder()
.append(attr("issn", j.getIssnPrinted()))
.append(attr("eissn", j.getIssnOnline()))
.append(attr("lissn", j.getIssnLinking()))
.append(attr("ep", j.getEp()))
.append(attr("iss", j.getIss()))
.append(attr("sp", j.getSp()))
.append(attr("vol", j.getVol()))
.toString()
.trim();
return new StringBuilder()
.append("<journal")
.append(isNotBlank(attrs) ? (" " + attrs) : "")
.append(">")
.append(escapeXml(j.getName()))
.append("</journal>")
.toString();
}
private static String attr(final String name, final String value) {
return isNotBlank(value) ? name + "=\"" + escapeXml(value) + "\" " : "";
}
public static String mapStructuredProperty(String name, StructuredProperty t) {
return asXmlElement(name, t.getValue(), t.getQualifier(), t.getDataInfo() != null ? t.getDataInfo() : null);
}
public static String mapQualifier(String name, Qualifier q) {
return asXmlElement(name, "", q, null);
}
public static String escapeXml(final String value) {
return value
.replaceAll("&", "&amp;")
.replaceAll("<", "&lt;")
.replaceAll(">", "&gt;")
.replaceAll("\"", "&quot;")
.replaceAll("'", "&apos;")
.replaceAll(xml10pattern, "");
}
public static String parseDataInfo(final DataInfo dataInfo) {
return new StringBuilder()
.append("<datainfo>")
.append(asXmlElement("inferred", dataInfo.getInferred() + ""))
.append(asXmlElement("deletedbyinference", dataInfo.getDeletedbyinference() + ""))
.append(asXmlElement("trust", dataInfo.getTrust() + ""))
.append(asXmlElement("inferenceprovenance", dataInfo.getInferenceprovenance() + ""))
.append(asXmlElement("provenanceaction", null, dataInfo.getProvenanceaction(), null))
.append("</datainfo>")
.toString();
}
private static StringBuilder dataInfoAsAttributes(final StringBuilder sb, final DataInfo info) {
return sb
.append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : ""))
.append(attr("inferenceprovenance", info.getInferenceprovenance()))
.append(attr("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : ""))
.append(attr("trust", info.getTrust()));
}
public static String mapKeyValue(final String name, final KeyValue kv) {
return new StringBuilder()
.append("<")
.append(name)
.append(" name=\"")
.append(escapeXml(kv.getValue()))
.append("\" id=\"")
.append(escapeXml(removePrefix(kv.getKey())))
.append("\"/>")
.toString();
}
public static String mapExtraInfo(final ExtraInfo e) {
return new StringBuilder("<extraInfo ")
.append("name=\"" + e.getName() + "\" ")
.append("typology=\"" + e.getTypology() + "\" ")
.append("provenance=\"" + e.getProvenance() + "\" ")
.append("trust=\"" + e.getTrust() + "\"")
.append(">")
.append(e.getValue())
.append("</extraInfo>")
.toString();
}
public static String asXmlElement(final String name, final String value) {
return asXmlElement(name, value, null, null);
}
public static String asXmlElement(final String name, final String value, final Qualifier q, final DataInfo info) {
StringBuilder sb = new StringBuilder();
sb.append("<");
sb.append(name);
if (q != null) {
sb.append(getAttributes(q));
}
if (info != null) {
sb
.append(" ")
.append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : ""))
.append(attr("inferenceprovenance", info.getInferenceprovenance()))
.append(attr("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : ""))
.append(attr("trust", info.getTrust()));
}
if (isBlank(value)) {
sb.append("/>");
return sb.toString();
}
sb.append(">");
sb.append(escapeXml(value));
sb.append("</");
sb.append(name);
sb.append(">");
return sb.toString();
}
public static String getAttributes(final Qualifier q) {
if (q == null || q.isBlank()) return "";
return new StringBuilder(" ")
.append(attr("classid", q.getClassid()))
.append(attr("classname", q.getClassname()))
.append(attr("schemeid", q.getSchemeid()))
.append(attr("schemename", q.getSchemename()))
.toString();
}
}
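// Usage sketch of the serialization helpers above; the key/value content is invented.
class XmlSerializationUtilsExample {
public static void main(String[] args) {
final KeyValue kv = new KeyValue();
kv.setKey("10|openaire____::abc123");
kv.setValue("Example <Datasource> & Co.");
// reserved characters are escaped and the "10|" prefix is stripped from the id attribute
System.out.println(XmlSerializationUtils.mapKeyValue("collectedfrom", kv));
System.out.println(XmlSerializationUtils.asXmlElement("publisher", "Acme & Sons"));
}
}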

View File

@ -0,0 +1 @@
net.sf.saxon.TransformerFactoryImpl

View File

@ -0,0 +1,6 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true}
]

View File

@ -0,0 +1,7 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read the XML records", "paramRequired": true},
{"paramName":"f", "paramLongName":"format", "paramDescription": "MDFormat name found in the IS profile", "paramRequired": true},
{"paramName":"b", "paramLongName":"batchSize", "paramDescription": "size of the batch of documents sent to solr", "paramRequired": false}
]

View File

@ -0,0 +1,34 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_db_name</name>
<value>openaire</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/applicationHistory</value>
</property>
</configuration>

View File

@ -0,0 +1,99 @@
<workflow-app name="index_infospace_graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>hive_db_name</name>
<description>the target hive database name</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<start to="reuse_records"/>
<decision name="reuse_records">
<switch>
<case to="adjancency_lists">${wf:conf('reuseRecords') eq false}</case>
<case to="to_solr_index">${wf:conf('reuseRecords') eq true}</case>
<default to="adjancency_lists"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="adjancency_lists">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn</master>
<mode>cluster</mode>
<name>build_adjacency_lists</name>
<class>eu.dnetlib.dhp.graph.SparkXmlRecordBuilderJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>-mt</arg> <arg>yarn</arg>
<arg>-is</arg> <arg>${isLookupUrl}</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="to_solr_index"/>
<error to="Kill"/>
</action>
<action name="to_solr_index">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn</master>
<mode>cluster</mode>
<name>to_solr_index</name>
<class>eu.dnetlib.dhp.graph.SparkXmlIndexingJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>-mt</arg> <arg>yarn</arg>
<arg>-is</arg> <arg>${isLookupUrl}</arg>
<arg>--sourcePath</arg><arg>${outputPath}/xml</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--batchSize</arg><arg>${batchSize}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
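
For orientation, the sketch below illustrates the kind of batched SolrJ indexing the "to_solr_index" action delegates to SparkXmlIndexingJob, using the solr-solrj dependency declared later in the root pom. It is not the job's actual code: the collection naming, field names and ZooKeeper quorum are assumptions, and the real job would read the XML records produced under ${outputPath}/xml from HDFS rather than an in-memory list.

package example;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;

import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.common.SolrInputDocument;

public class SolrIndexingSketch {

    public static void main(String[] args) throws Exception {
        final String format = "DMF";    // would come from the --format argument
        final int batchSize = 1000;     // would come from the --batchSize argument
        final List<String> zkHosts = Arrays.asList("zk1:2181", "zk2:2181", "zk3:2181"); // assumed quorum

        try (CloudSolrClient client = new CloudSolrClient.Builder(zkHosts, Optional.empty()).build()) {
            client.setDefaultCollection(format + "-index-openaire"); // assumed collection naming
            final List<SolrInputDocument> buffer = new ArrayList<>();

            // placeholder input; the real job streams the serialized records from HDFS
            for (String xml : Arrays.asList("<record>...</record>")) {
                final SolrInputDocument doc = new SolrInputDocument();
                doc.addField("__indexrecordidentifier", Integer.toHexString(xml.hashCode())); // assumed field name
                doc.addField("__result", xml);                                                // assumed field name
                buffer.add(doc);
                if (buffer.size() >= batchSize) {
                    client.add(buffer);
                    buffer.clear();
                }
            }
            if (!buffer.isEmpty()) {
                client.add(buffer);
            }
            client.commit();
        }
    }
}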

View File

@ -0,0 +1,3 @@
<$name$$if(hasId)$ objidentifier="$id$"$else$$endif$>
$metadata:{ it | $it$ }$
</$name$>

View File

@ -0,0 +1,10 @@
<oaf:$name$>
$metadata:{ it | $it$ }$
<rels>
$rels:{ it | $it$ }$
</rels>
<children>
$children:{ it | $it$ }$
</children>
</oaf:$name$>
$extrainfo:{ it | $it$ }$

View File

@ -0,0 +1,4 @@
<instance id="$instanceId$">
$metadata:{ it | $it$ }$
$webresources:{ it | $it$ }$
</instance>

View File

@ -0,0 +1,17 @@
<?xml version="1.0"?>
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<dri:objIdentifier>$id$</dri:objIdentifier>
<dri:dateOfCollection>$dateofcollection$</dri:dateOfCollection>
<dri:dateOfTransformation>$dateoftransformation$</dri:dateOfTransformation>
</header>
<metadata>
<oaf:entity xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xsi:schemaLocation="http://namespace.openaire.eu/oaf $schemaLocation$">
$it$
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -0,0 +1,4 @@
<rel inferred="$inferred$" trust="$trust$" inferenceprovenance="$inferenceprovenance$" provenanceaction="$provenanceaction$">
<to class="$class$" scheme="$scheme$" type="$type$">$objIdentifier$</to>
$metadata:{ it | $it$ }$
</rel>

View File

@ -0,0 +1,3 @@
<webresource>
<url>$identifier$</url>
</webresource>
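
A minimal sketch of how the $...$ delimited templates above can be rendered with the org.antlr:stringtemplate dependency added to the root pom further below; the attribute value passed in is a hypothetical URL, not data from this commit.

package example;

import org.antlr.stringtemplate.StringTemplate;

public class WebResourceTemplateSketch {

    public static void main(String[] args) {
        // same shape as the webresource template above; $identifier$ is filled via setAttribute
        final StringTemplate st = new StringTemplate(
                "<webresource>\n    <url>$identifier$</url>\n</webresource>");
        st.setAttribute("identifier", "http://example.org/record/1"); // hypothetical value
        System.out.println(st.toString());
    }
}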

View File

@ -0,0 +1,66 @@
package eu.dnetlib.dhp.graph;

import eu.dnetlib.dhp.graph.model.EntityRelEntity;
import eu.dnetlib.dhp.graph.model.RelatedEntity;
import eu.dnetlib.dhp.graph.utils.GraphMappingUtils;
import org.codehaus.jackson.map.ObjectMapper;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.io.InputStreamReader;

public class MappingUtilsTest {

    private GraphMappingUtils utils;

    @Before
    public void setUp() {
        utils = new GraphMappingUtils();
    }

    @Test
    public void testOafMappingDatasource() throws IOException {
        final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("datasource.json"));
        final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class);
        e.getSource().setType("datasource");
        final EntityRelEntity out = utils.asRelatedEntity(e);
        System.out.println(out);
    }

    //@Test
    public void testOafMappingResult() throws IOException {
        final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("result.json"));
        final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class);
        final EntityRelEntity out = utils.asRelatedEntity(e);
        System.out.println(out);
    }

    @Test
    public void testOafMappingSoftware() throws IOException {
        final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("software.json"));
        final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class);
        final EntityRelEntity out = utils.asRelatedEntity(e);
        System.out.println(out);
    }

    @Test
    public void testParseRelatedEntity() throws IOException {
        final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("related_entity.json"));
        final RelatedEntity e = new ObjectMapper().readValue(in, RelatedEntity.class);
        System.out.println(e);
    }
}
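
The tests above rely on Jackson 1.x to hydrate the model classes from JSON resources. A self-contained sketch of that round trip with a throwaway bean (not one of the project's model classes) looks like this.

package example;

import org.codehaus.jackson.map.ObjectMapper;

public class JacksonRoundTripSketch {

    public static class Entity {
        private String id;
        private String type;

        public String getId() { return id; }
        public void setId(String id) { this.id = id; }
        public String getType() { return type; }
        public void setType(String type) { this.type = type; }
    }

    public static void main(String[] args) throws Exception {
        final ObjectMapper mapper = new ObjectMapper();
        // deserialize from a JSON string, then serialize back
        final Entity e = mapper.readValue("{\"id\":\"20|abc\",\"type\":\"organization\"}", Entity.class);
        System.out.println(mapper.writeValueAsString(e));
    }
}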

View File

@ -0,0 +1,55 @@
package eu.dnetlib.dhp.graph;

import eu.dnetlib.dhp.graph.utils.ContextMapper;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.SparkSession;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

public class XmlRecordFactoryTest {

    private static final Log log = LogFactory.getLog(XmlRecordFactoryTest.class);

    private Path testDir;

    @Before
    public void setup() throws IOException {
        testDir = Files.createTempDirectory(getClass().getSimpleName());
        log.info("created test directory " + testDir.toString());
    }

    @After
    public void tearDown() throws IOException {
        FileUtils.deleteDirectory(testDir.toFile());
        log.info("deleted test directory " + testDir.toString());
    }

    @Test
    public void testXmlSerialization() throws Exception {
        final SparkSession spark = SparkSession
                .builder()
                .appName(SparkXmlRecordBuilderJob.class.getSimpleName())
                .master("local[*]")
                .getOrCreate();

        final String inputDir = testDir.toString() + "/3_joined_entities";
        FileUtils.forceMkdir(new File(inputDir));
        FileUtils.copyFile(new File("/Users/claudio/Downloads/joined_entities-part-00000"), new File(inputDir + "/joined_entities-part-00000"));

        final ContextMapper ctx = ContextMapper.fromIS("https://dev-openaire.d4science.org:443/is/services/isLookUp");

        final GraphJoiner g = new GraphJoiner(spark, ctx, inputDir, testDir.toString());
        g.asXML();
    }
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,5 @@
{
"id": "20|nih_________::6b8108b6d6399f7163a6a7ccdd0efc2d",
"type": "organization",
"legalname": "MCGILL UNIVERSITY"
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -10,9 +10,8 @@ This module is automatically executed when running:
on module having set:
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-wf</artifactId>
<version>1.0.0-SNAPSHOT</version>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
</parent>
in the `pom.xml` file. The `oozie-package` profile initializes Oozie workflow packaging, and the `workflow.source.dir` property points to a workflow (note: this is not a relative path but a classpath to the directory usually holding the `oozie_app` subdirectory).

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>
@ -20,6 +20,7 @@
<module>dhp-dedup</module>
<module>dhp-bulktag</module>
<module>dhp-propagation</module>
<module>dhp-graph-provision</module>
</modules>
<pluginRepositories>

114
pom.xml
View File

@ -1,9 +1,11 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.0.5-SNAPSHOT</version>
<version>1.1.6-SNAPSHOT</version>
<packaging>pom</packaging>
<url>http://www.d-net.research-infrastructures.eu</url>
@ -96,6 +98,12 @@
<version>${dhp.hadoop.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${dhp.hadoop.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
@ -149,7 +157,7 @@
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<version>9.5.1-5</version>
<version>9.9.1-6</version>
</dependency>
<dependency>
@ -170,6 +178,56 @@
<version>1.1.6</version>
</dependency>
<dependency>
<groupId>com.mycila.xmltool</groupId>
<artifactId>xmltool</artifactId>
<version>3.3</version>
</dependency>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
<version>7.5.0</version>
<exclusions>
<exclusion>
<artifactId>*</artifactId>
<groupId>*</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.lucidworks.spark</groupId>
<artifactId>spark-solr</artifactId>
<version>3.6.0</version>
<exclusions>
<exclusion>
<artifactId>*</artifactId>
<groupId>*</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>org.noggit</groupId>
<artifactId>noggit</artifactId>
<version>0.8</version>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>3.4.11</version>
</dependency>
<dependency>
<groupId>net.schmizz</groupId>
<artifactId>sshj</artifactId>
@ -200,10 +258,19 @@
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId>
<version>4.0.0-SNAPSHOT</version>
<version>4.0.0</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>cnr-rmi-api</artifactId>
<version>[2.0.0,3.0.0)</version>
</dependency>
<dependency>
<groupId>org.apache.cxf</groupId>
<artifactId>cxf-rt-transports-http</artifactId>
<version>3.1.5</version>
</dependency>
<dependency>
<groupId>javax.persistence</groupId>
<artifactId>javax.persistence-api</artifactId>
@ -231,6 +298,16 @@
<artifactId>secondstring</artifactId>
<version>1.0.0</version>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
<version>${mongodb.driver.version}</version>
</dependency>
<dependency>
<groupId>org.antlr</groupId>
<artifactId>stringtemplate</artifactId>
<version>4.0</version>
</dependency>
<dependency>
<groupId>org.apache.oozie</groupId>
@ -349,31 +426,7 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
<extensions>
@ -421,6 +474,7 @@
<dhp.jackson.version>2.9.6</dhp.jackson.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<scala.version>2.11.12</scala.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
</properties>
</project>
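
Purely as an illustration of the newly declared mongo-java-driver dependency, a minimal connection sketch in the 3.4.x driver style follows; the host, database and collection names are assumptions, not values taken from this commit.

package example;

import com.mongodb.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import org.bson.Document;

public class MongoClientSketch {

    public static void main(String[] args) {
        // placeholder connection details; a real metadata store would supply its own
        try (MongoClient client = new MongoClient("localhost", 27017)) {
            final MongoDatabase db = client.getDatabase("mdstore");
            final MongoCollection<Document> records = db.getCollection("records");
            System.out.println("documents in collection: " + records.count());
        }
    }
}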