forked from D-Net/dnet-hadoop
synch fork with master
This commit is contained in:
commit
5868ff8a86
|
@ -1,6 +1,8 @@
|
|||
.DS_Store
|
||||
.idea
|
||||
*.iml
|
||||
*.ipr
|
||||
*.iws
|
||||
*~
|
||||
.classpath
|
||||
/*/.classpath
|
||||
|
@ -18,5 +20,5 @@
|
|||
/*/build
|
||||
/build
|
||||
spark-warehouse
|
||||
/*/*/job-override.properties
|
||||
/**/job-override.properties
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-assembly-resources</artifactId>
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
||||
|
@ -76,6 +76,41 @@
|
|||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
<!--This plugin's configuration is used to store Eclipse m2e settings only. It has no influence on the Maven build itself.-->
|
||||
<plugin>
|
||||
<groupId>org.eclipse.m2e</groupId>
|
||||
<artifactId>lifecycle-mapping</artifactId>
|
||||
<version>1.0.0</version>
|
||||
<configuration>
|
||||
<lifecycleMappingMetadata>
|
||||
<pluginExecutions>
|
||||
<pluginExecution>
|
||||
<pluginExecutionFilter>
|
||||
<groupId>
|
||||
org.apache.maven.plugins
|
||||
</groupId>
|
||||
<artifactId>
|
||||
maven-plugin-plugin
|
||||
</artifactId>
|
||||
<versionRange>
|
||||
[3.2,)
|
||||
</versionRange>
|
||||
<goals>
|
||||
<goal>descriptor</goal>
|
||||
</goals>
|
||||
</pluginExecutionFilter>
|
||||
<action>
|
||||
<ignore></ignore>
|
||||
</action>
|
||||
</pluginExecution>
|
||||
</pluginExecutions>
|
||||
</lifecycleMappingMetadata>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
</build>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<packaging>pom</packaging>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
@ -42,6 +42,10 @@
|
|||
<groupId>com.rabbitmq</groupId>
|
||||
<artifactId>amqp-client</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.lib.ExtensionFunctionCall;
|
||||
import net.sf.saxon.lib.ExtensionFunctionDefinition;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.om.StructuredQName;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
|
||||
public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition {
|
||||
|
||||
public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension";
|
||||
|
||||
public abstract String getName();
|
||||
public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException;
|
||||
|
||||
@Override
|
||||
public StructuredQName getFunctionQName() {
|
||||
return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExtensionFunctionCall makeCallExpression() {
|
||||
return new ExtensionFunctionCall() {
|
||||
@Override
|
||||
public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
return doCall(context, arguments);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Item;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
import net.sf.saxon.value.SequenceType;
|
||||
import net.sf.saxon.value.StringValue;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Calendar;
|
||||
import java.util.GregorianCalendar;
|
||||
|
||||
public class ExtractYear extends AbstractExtensionFunction {
|
||||
|
||||
private static final String[] dateFormats = { "yyyy-MM-dd", "yyyy/MM/dd" };
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "extractYear";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null | arguments.length == 0) {
|
||||
return new StringValue("");
|
||||
}
|
||||
final Item item = arguments[0].head();
|
||||
if (item == null) {
|
||||
return new StringValue("");
|
||||
}
|
||||
return new StringValue(_year(item.getStringValue()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMinimumNumberOfArguments() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMaximumNumberOfArguments() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
|
||||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
private String _year(String s) {
|
||||
Calendar c = new GregorianCalendar();
|
||||
for (String format : dateFormats) {
|
||||
try {
|
||||
c.setTime(new SimpleDateFormat(format).parse(s));
|
||||
String year = String.valueOf(c.get(Calendar.YEAR));
|
||||
return year;
|
||||
} catch (ParseException e) {}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
import net.sf.saxon.value.SequenceType;
|
||||
import net.sf.saxon.value.StringValue;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
|
||||
public class NormalizeDate extends AbstractExtensionFunction {
|
||||
|
||||
private static final String[] normalizeDateFormats = { "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" };
|
||||
|
||||
private static final String normalizeOutFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'");
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "normalizeDate";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null | arguments.length == 0) {
|
||||
return new StringValue("");
|
||||
}
|
||||
String s = arguments[0].head().getStringValue();
|
||||
return new StringValue(_year(s));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMinimumNumberOfArguments() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMaximumNumberOfArguments() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
|
||||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
private String _year(String s) {
|
||||
final String date = s != null ? s.trim() : "";
|
||||
|
||||
for (String format : normalizeDateFormats) {
|
||||
try {
|
||||
Date parse = new SimpleDateFormat(format).parse(date);
|
||||
String res = new SimpleDateFormat(normalizeOutFormat).format(parse);
|
||||
return res;
|
||||
} catch (ParseException e) {}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Item;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
import net.sf.saxon.value.SequenceType;
|
||||
import net.sf.saxon.value.StringValue;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
public class PickFirst extends AbstractExtensionFunction {
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "pickFirst";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null | arguments.length == 0) {
|
||||
return new StringValue("");
|
||||
}
|
||||
|
||||
final String s1 = getValue(arguments[0]);
|
||||
final String s2 = getValue(arguments[1]);
|
||||
|
||||
return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : "");
|
||||
}
|
||||
|
||||
private String getValue(final Sequence arg) throws XPathException {
|
||||
if (arg != null) {
|
||||
final Item item = arg.head();
|
||||
if (item != null) {
|
||||
return item.getStringValue();
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMinimumNumberOfArguments() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMaximumNumberOfArguments() {
|
||||
return 2;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
|
||||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.Configuration;
|
||||
import net.sf.saxon.TransformerFactoryImpl;
|
||||
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerException;
|
||||
import javax.xml.transform.stream.StreamSource;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class SaxonTransformerFactory {
|
||||
|
||||
/**
|
||||
* Creates the index record transformer from the given XSLT
|
||||
* @param xslt
|
||||
* @return
|
||||
* @throws TransformerException
|
||||
*/
|
||||
public static Transformer newInstance(final String xslt) throws TransformerException {
|
||||
|
||||
final TransformerFactoryImpl factory = new TransformerFactoryImpl();
|
||||
final Configuration conf = factory.getConfiguration();
|
||||
conf.registerExtensionFunction(new ExtractYear());
|
||||
conf.registerExtensionFunction(new NormalizeDate());
|
||||
conf.registerExtensionFunction(new PickFirst());
|
||||
|
||||
return factory.newTransformer(new StreamSource(new StringReader(xslt)));
|
||||
}
|
||||
|
||||
}
|
|
@ -1,3 +1,11 @@
|
|||
Description of the project
|
||||
--------------------------
|
||||
This project defines **serialization schemas** of Avro data store files that are used to pass data between workflow nodes in the system.
|
||||
This project defines **object schemas** of the OpenAIRE main entities and the relationships that exist among them.
|
||||
Namely it defines the model for
|
||||
|
||||
- **research product (result)**, which is subclassed into publication, dataset, other research product, and software
|
||||
- **data source** object describing the data provider (institutional repositories, aggregators, CRIS systems)
|
||||
- **organization** research bodies managing a data source or participating in a research project
|
||||
- **project** research project
|
||||
|
||||
The serialization of such objects (data store files) is used to pass data between workflow nodes in the processing pipeline.
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
@ -26,6 +26,11 @@
|
|||
<artifactId>commons-lang3</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
|
|
|
@ -1,118 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Entity implements Serializable {
|
||||
|
||||
private String identifier;
|
||||
|
||||
private List<Pid> pid;
|
||||
|
||||
private List<String> title;
|
||||
|
||||
private List<String> date;
|
||||
|
||||
private String typology;
|
||||
|
||||
private List<String> authors;
|
||||
|
||||
private List<Subject> subject;
|
||||
|
||||
private String description;
|
||||
|
||||
private String completionStatus;
|
||||
|
||||
private List<Provenance> collectedFrom;
|
||||
|
||||
private List<String> publisher;
|
||||
|
||||
|
||||
public String getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
public void setIdentifier(String identifier) {
|
||||
this.identifier = identifier;
|
||||
}
|
||||
|
||||
public List<Pid> getPid() {
|
||||
return pid;
|
||||
}
|
||||
|
||||
public void setPid(List<Pid> pid) {
|
||||
this.pid = pid;
|
||||
}
|
||||
|
||||
public List<String> getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(List<String> title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public List<String> getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
public void setDate(List<String> date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
public String getTypology() {
|
||||
return typology;
|
||||
}
|
||||
|
||||
public void setTypology(String typology) {
|
||||
this.typology = typology;
|
||||
}
|
||||
|
||||
public List<String> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
public void setAuthors(List<String> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
|
||||
public List<Subject> getSubject() {
|
||||
return subject;
|
||||
}
|
||||
|
||||
public void setSubject(List<Subject> subject) {
|
||||
this.subject = subject;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public List<Provenance> getCollectedFrom() {
|
||||
return collectedFrom;
|
||||
}
|
||||
|
||||
public void setCollectedFrom(List<Provenance> collectedFrom) {
|
||||
this.collectedFrom = collectedFrom;
|
||||
}
|
||||
|
||||
public List<String> getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(List<String> publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public String getCompletionStatus() {
|
||||
return completionStatus;
|
||||
}
|
||||
|
||||
public void setCompletionStatus(String completionStatus) {
|
||||
this.completionStatus = completionStatus;
|
||||
}
|
||||
}
|
|
@ -1,33 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
public class Pid {
|
||||
|
||||
private String pid;
|
||||
|
||||
private String pidType;
|
||||
|
||||
public String getPid() {
|
||||
return pid;
|
||||
}
|
||||
|
||||
public void setPid(String pid) {
|
||||
this.pid = pid;
|
||||
}
|
||||
|
||||
public String getPidType() {
|
||||
return pidType;
|
||||
}
|
||||
|
||||
public void setPidType(String pidType) {
|
||||
this.pidType = pidType;
|
||||
}
|
||||
|
||||
public String generateId() {
|
||||
if(StringUtils.isEmpty(pid) || StringUtils.isEmpty(pidType))
|
||||
return null;
|
||||
return DHPUtils.md5(String.format("%s::%s", pid, pidType));
|
||||
}
|
||||
}
|
|
@ -1,35 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
/**
 * Bean describing where a record comes from: datasource id, datasource name and completion status.
 *
 * Implements {@link java.io.Serializable} because the serializable {@code Entity} and
 * {@code Relation} classes keep a {@code List<Provenance>}: without it, Java serialization
 * of those objects fails with {@code NotSerializableException}.
 */
public class Provenance implements java.io.Serializable {

	private String datasourceId;

	private String datasourceName;

	private String completionStatus;

	/** @return the datasource id */
	public String getDatasourceId() {
		return datasourceId;
	}

	/** @param datasourceId the datasource id to store */
	public void setDatasourceId(final String datasourceId) {
		this.datasourceId = datasourceId;
	}

	/** @return the datasource name */
	public String getDatasourceName() {
		return datasourceName;
	}

	/** @param datasourceName the datasource name to store */
	public void setDatasourceName(final String datasourceName) {
		this.datasourceName = datasourceName;
	}

	/** @return the completion status */
	public String getCompletionStatus() {
		return completionStatus;
	}

	/** @param completionStatus the completion status to store */
	public void setCompletionStatus(final String completionStatus) {
		this.completionStatus = completionStatus;
	}
}
|
|
@ -1,47 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Relation implements Serializable {
|
||||
|
||||
private String source;
|
||||
|
||||
private String target;
|
||||
|
||||
private List<Provenance> provenance;
|
||||
|
||||
private RelationSemantic semantic;
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public void setSource(String source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public String getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
public void setTarget(String target) {
|
||||
this.target = target;
|
||||
}
|
||||
|
||||
public List<Provenance> getProvenance() {
|
||||
return provenance;
|
||||
}
|
||||
|
||||
public void setProvenance(List<Provenance> provenance) {
|
||||
this.provenance = provenance;
|
||||
}
|
||||
|
||||
public RelationSemantic getSemantic() {
|
||||
return semantic;
|
||||
}
|
||||
|
||||
public void setSemantic(RelationSemantic semantic) {
|
||||
this.semantic = semantic;
|
||||
}
|
||||
}
|
|
@ -1,16 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class RelationSemantic extends Subject implements Serializable {
|
||||
|
||||
public String inverse;
|
||||
|
||||
public String getInverse() {
|
||||
return inverse;
|
||||
}
|
||||
|
||||
public void setInverse(String inverse) {
|
||||
this.inverse = inverse;
|
||||
}
|
||||
}
|
|
@ -1,35 +0,0 @@
|
|||
package eu.dnetlib.dhp.schema.dli;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable (schema, value) pair.
 */
public class Subject implements Serializable {

	private String schema;
	private String value;

	/** Creates a subject with both properties unset ({@code null}). */
	public Subject() {
		// nothing to initialise
	}

	/**
	 * @param schema the schema to store
	 * @param value the value to store
	 */
	public Subject(final String schema, final String value) {
		this.schema = schema;
		this.value = value;
	}

	/** @return the schema */
	public String getSchema() {
		return this.schema;
	}

	/** @param schema the schema to store */
	public void setSchema(final String schema) {
		this.schema = schema;
	}

	/** @return the value */
	public String getValue() {
		return this.value;
	}

	/** @param value the value to store */
	public void setValue(final String value) {
		this.value = value;
	}
}
|
|
@ -0,0 +1,15 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
public class Country extends Qualifier {
|
||||
|
||||
private DataInfo dataInfo;
|
||||
|
||||
public DataInfo getDataInfo() {
|
||||
return dataInfo;
|
||||
}
|
||||
|
||||
public void setDataInfo(DataInfo dataInfo) {
|
||||
this.dataInfo = dataInfo;
|
||||
}
|
||||
|
||||
}
|
|
@ -40,9 +40,9 @@ public class Datasource extends OafEntity implements Serializable {
|
|||
|
||||
private List<Field<String>> odlanguages;
|
||||
|
||||
private List< Field<String>> odcontenttypes;
|
||||
private List<Field<String>> odcontenttypes;
|
||||
|
||||
private List< Field<String>> accessinfopackage;
|
||||
private List<Field<String>> accessinfopackage;
|
||||
|
||||
// re3data fields
|
||||
private Field<String> releasestartdate;
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -36,7 +37,7 @@ public class GeoLocation implements Serializable {
|
|||
this.place = place;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isBlank() {
|
||||
return StringUtils.isBlank(point) &&
|
||||
StringUtils.isBlank(box) &&
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Instance implements Serializable {
|
||||
|
||||
|
@ -12,7 +13,7 @@ public class Instance implements Serializable {
|
|||
|
||||
private KeyValue hostedby;
|
||||
|
||||
private String url;
|
||||
private List<String> url;
|
||||
|
||||
// other research products specific
|
||||
private String distributionlocation;
|
||||
|
@ -21,6 +22,14 @@ public class Instance implements Serializable {
|
|||
|
||||
private Field<String> dateofacceptance;
|
||||
|
||||
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargeamount;
|
||||
|
||||
// currency - alphabetic code described in ISO-4217. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargecurrency;
|
||||
|
||||
private Field<String> refereed; //peer-review status
|
||||
|
||||
public Field<String> getLicense() {
|
||||
return license;
|
||||
}
|
||||
|
@ -53,11 +62,11 @@ public class Instance implements Serializable {
|
|||
this.hostedby = hostedby;
|
||||
}
|
||||
|
||||
public String getUrl() {
|
||||
public List<String> getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public void setUrl(String url) {
|
||||
public void setUrl(List<String> url) {
|
||||
this.url = url;
|
||||
}
|
||||
|
||||
|
@ -85,7 +94,29 @@ public class Instance implements Serializable {
|
|||
this.dateofacceptance = dateofacceptance;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargeamount() {
|
||||
return processingchargeamount;
|
||||
}
|
||||
|
||||
public void setProcessingchargeamount(Field<String> processingchargeamount) {
|
||||
this.processingchargeamount = processingchargeamount;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargecurrency() {
|
||||
return processingchargecurrency;
|
||||
}
|
||||
|
||||
public void setProcessingchargecurrency(Field<String> processingchargecurrency) {
|
||||
this.processingchargecurrency = processingchargecurrency;
|
||||
}
|
||||
|
||||
public Field<String> getRefereed() {
|
||||
return refereed;
|
||||
}
|
||||
|
||||
public void setRefereed(Field<String> refereed) {
|
||||
this.refereed = refereed;
|
||||
}
|
||||
|
||||
public String toComparableString(){
|
||||
return String.format("%s::%s::%s::%s",
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -40,6 +41,7 @@ public class KeyValue implements Serializable {
|
|||
return isBlank()?"":String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : "");
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isBlank() {
|
||||
return StringUtils.isBlank(key) && StringUtils.isBlank(value);
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -50,6 +51,8 @@ public class Qualifier implements Serializable {
|
|||
schemeid != null ? schemeid : "",
|
||||
schemename != null ? schemename : "");
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isBlank() {
|
||||
return StringUtils.isBlank(classid) &&
|
||||
StringUtils.isBlank(classname) &&
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
public abstract class Result extends OafEntity implements Serializable {
|
||||
|
||||
|
@ -16,7 +14,7 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
// common fields
|
||||
private Qualifier language;
|
||||
|
||||
private List<Qualifier> country;
|
||||
private List<Country> country;
|
||||
|
||||
private List<StructuredProperty> subject;
|
||||
|
||||
|
@ -44,16 +42,10 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
|
||||
private List<Field<String>> coverage;
|
||||
|
||||
private Field<String> refereed; //peer-review status
|
||||
private Qualifier bestaccessright;
|
||||
|
||||
private List<Context> context;
|
||||
|
||||
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargeamount;
|
||||
|
||||
// currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly typed results
|
||||
private Field<String> processingchargecurrency;
|
||||
|
||||
private List<ExternalReference> externalReference;
|
||||
|
||||
private List<Instance> instance;
|
||||
|
@ -82,11 +74,11 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
this.language = language;
|
||||
}
|
||||
|
||||
public List<Qualifier> getCountry() {
|
||||
public List<Country> getCountry() {
|
||||
return country;
|
||||
}
|
||||
|
||||
public void setCountry(List<Qualifier> country) {
|
||||
public void setCountry(List<Country> country) {
|
||||
this.country = country;
|
||||
}
|
||||
|
||||
|
@ -194,12 +186,12 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
this.coverage = coverage;
|
||||
}
|
||||
|
||||
public Field<String> getRefereed() {
|
||||
return refereed;
|
||||
public Qualifier getBestaccessright() {
|
||||
return bestaccessright;
|
||||
}
|
||||
|
||||
public void setRefereed(Field<String> refereed) {
|
||||
this.refereed = refereed;
|
||||
public void setBestaccessright(Qualifier bestaccessright) {
|
||||
this.bestaccessright = bestaccessright;
|
||||
}
|
||||
|
||||
public List<Context> getContext() {
|
||||
|
@ -226,24 +218,6 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
this.instance = instance;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargeamount() {
|
||||
return processingchargeamount;
|
||||
}
|
||||
|
||||
public Result setProcessingchargeamount(Field<String> processingchargeamount) {
|
||||
this.processingchargeamount = processingchargeamount;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargecurrency() {
|
||||
return processingchargecurrency;
|
||||
}
|
||||
|
||||
public Result setProcessingchargecurrency(Field<String> processingchargecurrency) {
|
||||
this.processingchargecurrency = processingchargecurrency;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void mergeFrom(OafEntity e) {
|
||||
super.mergeFrom(e);
|
||||
|
@ -287,19 +261,9 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
|
||||
coverage = mergeLists(coverage, r.getCoverage());
|
||||
|
||||
if (r.getRefereed() != null && compareTrust(this, r) < 0)
|
||||
refereed = r.getRefereed();
|
||||
|
||||
context = mergeLists(context, r.getContext());
|
||||
|
||||
if (r.getProcessingchargeamount() != null && compareTrust(this, r) < 0)
|
||||
processingchargeamount = r.getProcessingchargeamount();
|
||||
|
||||
if (r.getProcessingchargecurrency() != null && compareTrust(this, r) < 0)
|
||||
processingchargecurrency = r.getProcessingchargecurrency();
|
||||
|
||||
externalReference = mergeLists(externalReference, r.getExternalReference());
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -314,5 +278,4 @@ public abstract class Result extends OafEntity implements Serializable {
|
|||
return a.size() > b.size() ? a : b;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
|
||||
|
@ -26,6 +26,12 @@
|
|||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
|
@ -45,6 +51,17 @@
|
|||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongo-java-driver</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.postgresql</groupId>
|
||||
<artifactId>postgresql</artifactId>
|
||||
<version>42.2.10</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-core</artifactId>
|
||||
|
|
|
@ -0,0 +1,236 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class AbstractMigrationExecutor implements Closeable {
|
||||
|
||||
private final AtomicInteger counter = new AtomicInteger(0);
|
||||
|
||||
private final Text key = new Text();
|
||||
|
||||
private final Text value = new Text();
|
||||
|
||||
private final ObjectMapper objectMapper = new ObjectMapper();
|
||||
|
||||
private final SequenceFile.Writer writer;
|
||||
|
||||
private static final Log log = LogFactory.getLog(AbstractMigrationExecutor.class);
|
||||
|
||||
public AbstractMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser) throws Exception {
|
||||
|
||||
log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s, nameNode=%s, user=%s", hdfsPath, hdfsNameNode, hdfsUser));
|
||||
|
||||
this.writer = SequenceFile.createWriter(getConf(hdfsNameNode, hdfsUser), SequenceFile.Writer.file(new Path(hdfsPath)), SequenceFile.Writer
|
||||
.keyClass(Text.class), SequenceFile.Writer.valueClass(Text.class));
|
||||
}
|
||||
|
||||
private Configuration getConf(final String hdfsNameNode, final String hdfsUser) throws IOException {
|
||||
final Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
System.setProperty("HADOOP_USER_NAME", hdfsUser);
|
||||
System.setProperty("hadoop.home.dir", "/");
|
||||
FileSystem.get(URI.create(hdfsNameNode), conf);
|
||||
return conf;
|
||||
}
|
||||
|
||||
protected void emitOaf(final Oaf oaf) {
|
||||
try {
|
||||
key.set(counter.getAndIncrement() + ":" + oaf.getClass().getSimpleName().toLowerCase());
|
||||
value.set(objectMapper.writeValueAsString(oaf));
|
||||
writer.append(key, value);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
writer.hflush();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
public static KeyValue keyValue(final String k, final String v) {
|
||||
final KeyValue kv = new KeyValue();
|
||||
kv.setKey(k);
|
||||
kv.setValue(v);
|
||||
return kv;
|
||||
}
|
||||
|
||||
public static List<KeyValue> listKeyValues(final String... s) {
|
||||
if (s.length % 2 > 0) { throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); }
|
||||
|
||||
final List<KeyValue> list = new ArrayList<>();
|
||||
for (int i = 0; i < s.length; i += 2) {
|
||||
list.add(keyValue(s[i], s[i + 1]));
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
public static <T> Field<T> field(final T value, final DataInfo info) {
|
||||
if (value == null || StringUtils.isBlank(value.toString())) { return null; }
|
||||
|
||||
final Field<T> field = new Field<>();
|
||||
field.setValue(value);
|
||||
field.setDataInfo(info);
|
||||
return field;
|
||||
}
|
||||
|
||||
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
|
||||
return Arrays.stream(values).map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
|
||||
return values.stream().map(v -> field(v, info)).filter(Objects::nonNull).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static Qualifier qualifier(final String classid, final String classname, final String schemeid, final String schemename) {
|
||||
final Qualifier q = new Qualifier();
|
||||
q.setClassid(classid);
|
||||
q.setClassname(classname);
|
||||
q.setSchemeid(schemeid);
|
||||
q.setSchemename(schemename);
|
||||
return q;
|
||||
}
|
||||
|
||||
public static StructuredProperty structuredProperty(final String value,
|
||||
final String classid,
|
||||
final String classname,
|
||||
final String schemeid,
|
||||
final String schemename,
|
||||
final DataInfo dataInfo) {
|
||||
|
||||
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
|
||||
}
|
||||
|
||||
public static StructuredProperty structuredProperty(final String value, final Qualifier qualifier, final DataInfo dataInfo) {
|
||||
if (value == null) { return null; }
|
||||
final StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(value);
|
||||
sp.setQualifier(qualifier);
|
||||
sp.setDataInfo(dataInfo);
|
||||
return sp;
|
||||
}
|
||||
|
||||
public static ExtraInfo extraInfo(final String name, final String value, final String typology, final String provenance, final String trust) {
|
||||
final ExtraInfo info = new ExtraInfo();
|
||||
info.setName(name);
|
||||
info.setValue(value);
|
||||
info.setTypology(typology);
|
||||
info.setProvenance(provenance);
|
||||
info.setTrust(trust);
|
||||
return info;
|
||||
}
|
||||
|
||||
public static OAIProvenance oaiIProvenance(final String identifier,
|
||||
final String baseURL,
|
||||
final String metadataNamespace,
|
||||
final Boolean altered,
|
||||
final String datestamp,
|
||||
final String harvestDate) {
|
||||
|
||||
final OriginDescription desc = new OriginDescription();
|
||||
desc.setIdentifier(identifier);
|
||||
desc.setBaseURL(baseURL);
|
||||
desc.setMetadataNamespace(metadataNamespace);
|
||||
desc.setAltered(altered);
|
||||
desc.setDatestamp(datestamp);
|
||||
desc.setHarvestDate(harvestDate);
|
||||
|
||||
final OAIProvenance p = new OAIProvenance();
|
||||
p.setOriginDescription(desc);
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
public static Journal journal(final String name,
|
||||
final String issnPrinted,
|
||||
final String issnOnline,
|
||||
final String issnLinking,
|
||||
final String ep,
|
||||
final String iss,
|
||||
final String sp,
|
||||
final String vol,
|
||||
final String edition,
|
||||
final String conferenceplace,
|
||||
final String conferencedate,
|
||||
final DataInfo dataInfo) {
|
||||
|
||||
if (StringUtils.isNotBlank(name) || StringUtils.isNotBlank(issnPrinted) || StringUtils.isNotBlank(issnOnline) || StringUtils.isNotBlank(issnLinking)) {
|
||||
final Journal j = new Journal();
|
||||
j.setName(name);
|
||||
j.setIssnPrinted(issnPrinted);
|
||||
j.setIssnOnline(issnOnline);
|
||||
j.setIssnLinking(issnLinking);
|
||||
j.setEp(ep);
|
||||
j.setIss(iss);
|
||||
j.setSp(sp);
|
||||
j.setVol(vol);
|
||||
j.setEdition(edition);
|
||||
j.setConferenceplace(conferenceplace);
|
||||
j.setConferencedate(conferencedate);
|
||||
j.setDataInfo(dataInfo);
|
||||
return j;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static DataInfo dataInfo(final Boolean deletedbyinference,
|
||||
final String inferenceprovenance,
|
||||
final Boolean inferred,
|
||||
final Boolean invisible,
|
||||
final Qualifier provenanceaction,
|
||||
final String trust) {
|
||||
final DataInfo d = new DataInfo();
|
||||
d.setDeletedbyinference(deletedbyinference);
|
||||
d.setInferenceprovenance(inferenceprovenance);
|
||||
d.setInferred(inferred);
|
||||
d.setInvisible(invisible);
|
||||
d.setProvenanceaction(provenanceaction);
|
||||
d.setTrust(trust);
|
||||
return d;
|
||||
}
|
||||
|
||||
public static String createOpenaireId(final int prefix, final String originalId) {
|
||||
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
|
||||
final String rest = StringUtils.substringAfter(originalId, "::");
|
||||
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
|
||||
|
||||
}
|
||||
|
||||
public static String asString(final Object o) {
|
||||
return o == null ? "" : o.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,439 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.DocumentFactory;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
||||
|
||||
protected final Map<String, String> code2name = new HashMap<>();
|
||||
|
||||
protected final MdstoreClient mdstoreClient;
|
||||
|
||||
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
||||
|
||||
protected static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
|
||||
qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies");
|
||||
protected static final Qualifier DATASET_RESULTTYPE_QUALIFIER = qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
|
||||
protected static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
|
||||
protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
|
||||
|
||||
private static final Log log = LogFactory.getLog(AbstractMongoExecutor.class);
|
||||
|
||||
public AbstractMongoExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl,
|
||||
final String mongoDb, final String dbUrl, final String dbUser,
|
||||
final String dbPassword) throws Exception {
|
||||
|
||||
super(hdfsPath, hdfsNameNode, hdfsUser);
|
||||
|
||||
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
|
||||
loadClassNames(dbUrl, dbUser, dbPassword);
|
||||
|
||||
final Map<String, String> nsContext = new HashMap<>();
|
||||
|
||||
registerNamespaces(nsContext);
|
||||
|
||||
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
|
||||
}
|
||||
|
||||
private void loadClassNames(final String dbUrl, final String dbUser, final String dbPassword) throws IOException {
|
||||
|
||||
log.info("Loading vocabulary terms from db...");
|
||||
|
||||
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
|
||||
code2name.clear();
|
||||
dbClient.processResults("select code, name from class", rs -> {
|
||||
try {
|
||||
code2name.put(rs.getString("code"), rs.getString("name"));
|
||||
} catch (final SQLException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
log.info("Found " + code2name.size() + " terms.");
|
||||
|
||||
}
|
||||
|
||||
public void processMdRecords(final String mdFormat, final String mdLayout, final String mdInterpretation) throws DocumentException {
|
||||
|
||||
log.info(String.format("Searching mdstores (format: %s, layout: %s, interpretation: %s)", mdFormat, mdLayout, mdInterpretation));
|
||||
|
||||
final Map<String, String> colls = mdstoreClient.validCollections(mdFormat, mdLayout, mdInterpretation);
|
||||
log.info("Found " + colls.size() + " mdstores");
|
||||
|
||||
for (final Entry<String, String> entry : colls.entrySet()) {
|
||||
log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")");
|
||||
final String currentColl = entry.getValue();
|
||||
|
||||
for (final String xml : mdstoreClient.listRecords(currentColl)) {
|
||||
final Document doc = DocumentHelper.parseText(xml);
|
||||
|
||||
final String type = doc.valueOf("//dr:CobjCategory/@type");
|
||||
final KeyValue collectedFrom = keyValue(doc.valueOf("//oaf:collectedFrom/@id"), doc.valueOf("//oaf:collectedFrom/@name"));
|
||||
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) ? collectedFrom
|
||||
: keyValue(doc.valueOf("//oaf:hostedBy/@id"), doc.valueOf("//oaf:hostedBy/@name"));
|
||||
|
||||
final DataInfo info = prepareDataInfo(doc);
|
||||
final long lastUpdateTimestamp = new Date().getTime();
|
||||
|
||||
for (final Oaf oaf : createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp)) {
|
||||
emitOaf(oaf);
|
||||
}
|
||||
}
|
||||
}
|
||||
log.info("All Done.");
|
||||
}
|
||||
|
||||
protected void registerNamespaces(final Map<String, String> nsContext) {
|
||||
nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
|
||||
nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
|
||||
nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
|
||||
nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
|
||||
nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
|
||||
}
|
||||
|
||||
protected List<Oaf> createOafs(final Document doc,
|
||||
final String type,
|
||||
final KeyValue collectedFrom,
|
||||
final KeyValue hostedBy,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp) {
|
||||
|
||||
final List<Oaf> oafs = new ArrayList<>();
|
||||
|
||||
switch (type.toLowerCase()) {
|
||||
case "":
|
||||
case "publication":
|
||||
final Publication p = new Publication();
|
||||
populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||
p.setResulttype(PUBLICATION_RESULTTYPE_QUALIFIER);
|
||||
p.setJournal(prepareJournal(doc, info));
|
||||
oafs.add(p);
|
||||
break;
|
||||
case "dataset":
|
||||
final Dataset d = new Dataset();
|
||||
populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||
d.setResulttype(DATASET_RESULTTYPE_QUALIFIER);
|
||||
d.setStoragedate(prepareDatasetStorageDate(doc, info));
|
||||
d.setDevice(prepareDatasetDevice(doc, info));
|
||||
d.setSize(prepareDatasetSize(doc, info));
|
||||
d.setVersion(prepareDatasetVersion(doc, info));
|
||||
d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
|
||||
d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
|
||||
d.setGeolocation(prepareDatasetGeoLocations(doc, info));
|
||||
oafs.add(d);
|
||||
break;
|
||||
case "software":
|
||||
final Software s = new Software();
|
||||
populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||
s.setResulttype(SOFTWARE_RESULTTYPE_QUALIFIER);
|
||||
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
|
||||
s.setLicense(prepareSoftwareLicenses(doc, info));
|
||||
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
|
||||
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
|
||||
oafs.add(s);
|
||||
break;
|
||||
case "otherresearchproducts":
|
||||
default:
|
||||
final OtherResearchProduct o = new OtherResearchProduct();
|
||||
populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||
o.setResulttype(OTHER_RESULTTYPE_QUALIFIER);
|
||||
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
|
||||
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
|
||||
o.setTool(prepareOtherResearchProductTools(doc, info));
|
||||
oafs.add(o);
|
||||
break;
|
||||
}
|
||||
|
||||
if (!oafs.isEmpty()) {
|
||||
oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
|
||||
oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
|
||||
}
|
||||
|
||||
return oafs;
|
||||
}
|
||||
|
||||
private List<Oaf> addProjectRels(final Document doc,
|
||||
final KeyValue collectedFrom,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp) {
|
||||
|
||||
final List<Oaf> res = new ArrayList<>();
|
||||
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
|
||||
|
||||
for (final Object o : doc.selectNodes("//oaf:projectid")) {
|
||||
final String projectId = createOpenaireId(40, ((Node) o).getText());
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
r1.setRelType("resultProject");
|
||||
r1.setSubRelType("outcome");
|
||||
r1.setRelClass("isProducedBy");
|
||||
r1.setSource(docId);
|
||||
r1.setTarget(projectId);
|
||||
r1.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
res.add(r1);
|
||||
|
||||
final Relation r2 = new Relation();
|
||||
r2.setRelType("resultProject");
|
||||
r2.setSubRelType("outcome");
|
||||
r2.setRelClass("produces");
|
||||
r2.setSource(projectId);
|
||||
r2.setTarget(docId);
|
||||
r2.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||
r2.setDataInfo(info);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
res.add(r2);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
protected abstract List<Oaf> addOtherResultRels(final Document doc,
|
||||
final KeyValue collectedFrom,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp);
|
||||
|
||||
private void populateResultFields(final Result r,
|
||||
final Document doc,
|
||||
final KeyValue collectedFrom,
|
||||
final KeyValue hostedBy,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp) {
|
||||
r.setDataInfo(info);
|
||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier")));
|
||||
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
|
||||
r.setCollectedfrom(Arrays.asList(collectedFrom));
|
||||
r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
|
||||
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
|
||||
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
|
||||
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setOaiprovenance(prepareOAIprovenance(doc));
|
||||
r.setAuthor(prepareAuthors(doc, info));
|
||||
r.setLanguage(prepareLanguages(doc));
|
||||
r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setSubject(prepareSubjects(doc, info));
|
||||
r.setTitle(prepareTitles(doc, info));
|
||||
r.setRelevantdate(prepareRelevantDates(doc, info));
|
||||
r.setDescription(prepareDescriptions(doc, info));
|
||||
r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
|
||||
r.setPublisher(preparePublisher(doc, info));
|
||||
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
|
||||
r.setSource(prepareSources(doc, info));
|
||||
r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setFormat(prepareFormats(doc, info));
|
||||
r.setContributor(prepareContributors(doc, info));
|
||||
r.setResourcetype(prepareResourceType(doc, info));
|
||||
r.setCoverage(prepareCoverages(doc, info));
|
||||
r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
|
||||
}
|
||||
|
||||
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Instance> prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
|
||||
|
||||
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<StructuredProperty> prepareRelevantDates(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareCoverages(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareContributors(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareFormats(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> preparePublisher(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareDescriptions(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Qualifier prepareLanguages(Document doc);
|
||||
|
||||
protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareOtherResearchProductTools(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetMetadataVersionNumber(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetLastMetadataUpdate(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetVersion(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetSize(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetDevice(Document doc, DataInfo info);
|
||||
|
||||
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
||||
|
||||
private Journal prepareJournal(final Document doc, final DataInfo info) {
|
||||
final Node n = doc.selectSingleNode("//oaf:journal");
|
||||
if (n != null) {
|
||||
final String name = n.getText();
|
||||
final String issnPrinted = n.valueOf("@issn");
|
||||
final String issnOnline = n.valueOf("@eissn");
|
||||
final String issnLinking = n.valueOf("@lissn");
|
||||
final String ep = n.valueOf("@ep");
|
||||
final String iss = n.valueOf("@iss");
|
||||
final String sp = n.valueOf("@sp");
|
||||
final String vol = n.valueOf("@vol");
|
||||
final String edition = n.valueOf("@edition");
|
||||
if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); }
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
protected Qualifier prepareQualifier(final Node node, final String xpath, final String schemeId, final String schemeName) {
|
||||
final String classId = node.valueOf(xpath);
|
||||
final String className = code2name.get(classId);
|
||||
return qualifier(classId, className, schemeId, schemeName);
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> prepareListStructProps(final Node node,
|
||||
final String xpath,
|
||||
final String xpathClassId,
|
||||
final String schemeId,
|
||||
final String schemeName,
|
||||
final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
final String classId = n.valueOf(xpathClassId);
|
||||
final String className = code2name.get(classId);
|
||||
res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
res.add(structuredProperty(n.getText(), qualifier, info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected List<StructuredProperty> prepareListStructProps(final Node node, final String xpath, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final Node n = (Node) o;
|
||||
res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n
|
||||
.valueOf("@schemename"), info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected OAIProvenance prepareOAIprovenance(final Document doc) {
|
||||
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
|
||||
|
||||
if (n == null) { return null; }
|
||||
|
||||
final String identifier = n.valueOf("./*[local-name()='identifier']");
|
||||
final String baseURL = n.valueOf("./*[local-name()='baseURL']");;
|
||||
final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");;
|
||||
final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
|
||||
final String datestamp = n.valueOf("./*[local-name()='datestamp']");;
|
||||
final String harvestDate = n.valueOf("@harvestDate");;
|
||||
|
||||
return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
|
||||
|
||||
}
|
||||
|
||||
protected DataInfo prepareDataInfo(final Document doc) {
|
||||
final Node n = doc.selectSingleNode("//oaf:datainfo");
|
||||
|
||||
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
|
||||
final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
|
||||
final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
|
||||
final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");
|
||||
|
||||
final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
|
||||
final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
|
||||
final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
|
||||
final String trust = n.valueOf("./oaf:trust");
|
||||
|
||||
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
|
||||
}
|
||||
|
||||
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
|
||||
return field(node.valueOf(xpath), info);
|
||||
}
|
||||
|
||||
protected List<Field<String>> prepareListFields(final Node node, final String xpath, final DataInfo info) {
|
||||
return listFields(info, prepareListString(node, xpath));
|
||||
}
|
||||
|
||||
protected List<String> prepareListString(final Node node, final String xpath) {
|
||||
final List<String> res = new ArrayList<>();
|
||||
for (final Object o : node.selectNodes(xpath)) {
|
||||
final String s = ((Node) o).getText().trim();
|
||||
if (StringUtils.isNotBlank(s)) {
|
||||
res.add(s);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
mdstoreClient.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
public class DbClient implements Closeable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(DbClient.class);
|
||||
|
||||
private Connection connection;
|
||||
|
||||
public DbClient(final String address, final String login, final String password) {
|
||||
|
||||
try {
|
||||
Class.forName("org.postgresql.Driver");
|
||||
|
||||
this.connection =
|
||||
StringUtils.isNoneBlank(login, password) ? DriverManager.getConnection(address, login, password) : DriverManager.getConnection(address);
|
||||
this.connection.setAutoCommit(false);
|
||||
} catch (final Exception e) {
|
||||
log.error(e.getClass().getName() + ": " + e.getMessage());
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
log.info("Opened database successfully");
|
||||
}
|
||||
|
||||
public void processResults(final String sql, final Consumer<ResultSet> consumer) {
|
||||
|
||||
try (final Statement stmt = connection.createStatement()) {
|
||||
stmt.setFetchSize(100);
|
||||
|
||||
try (final ResultSet rs = stmt.executeQuery(sql)) {
|
||||
while (rs.next()) {
|
||||
consumer.accept(rs);
|
||||
}
|
||||
} catch (final SQLException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
} catch (final SQLException e1) {
|
||||
throw new RuntimeException(e1);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
try {
|
||||
connection.close();
|
||||
} catch (final SQLException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class ExtractEntitiesFromHDFSJob {
|
||||
|
||||
|
||||
private static List<String> folderNames = Arrays.asList("db_entities", "oaf_entities", "odf_entities");
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/extract_entities_from_hdfs_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(ExtractEntitiesFromHDFSJob.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
final String targetPath = parser.get("graphRawPath");
|
||||
final String entity = parser.get("entity");
|
||||
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
|
||||
JavaRDD<String> inputRdd = sc.emptyRDD();
|
||||
|
||||
|
||||
folderNames.forEach(p -> inputRdd.union(
|
||||
sc.sequenceFile(sourcePath+"/"+p, Text.class, Text.class)
|
||||
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
|
||||
.filter(k -> isEntityType(k._1(), entity))
|
||||
.map(Tuple2::_2))
|
||||
);
|
||||
|
||||
inputRdd.saveAsTextFile(targetPath+"/"+entity);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isEntityType(final String item, final String entity) {
|
||||
return StringUtils.substringAfter(item, ":").equalsIgnoreCase(entity);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,94 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.bson.Document;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.mongodb.MongoClient;
|
||||
import com.mongodb.MongoClientURI;
|
||||
import com.mongodb.client.MongoCollection;
|
||||
import com.mongodb.client.MongoDatabase;
|
||||
|
||||
public class MdstoreClient implements Closeable {
|
||||
|
||||
private final MongoClient client;
|
||||
private final MongoDatabase db;
|
||||
|
||||
private static final String COLL_METADATA = "metadata";
|
||||
private static final String COLL_METADATA_MANAGER = "metadataManager";
|
||||
|
||||
private static final Log log = LogFactory.getLog(MdstoreClient.class);
|
||||
|
||||
public MdstoreClient(final String baseUrl, final String dbName) {
|
||||
this.client = new MongoClient(new MongoClientURI(baseUrl));
|
||||
this.db = getDb(client, dbName);
|
||||
}
|
||||
|
||||
public Map<String, String> validCollections(final String mdFormat, final String mdLayout, final String mdInterpretation) {
|
||||
|
||||
final Map<String, String> transactions = new HashMap<>();
|
||||
for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) {
|
||||
final String mdId = entry.getString("mdId");
|
||||
final String currentId = entry.getString("currentId");
|
||||
if (StringUtils.isNoneBlank(mdId, currentId)) {
|
||||
transactions.put(mdId, currentId);
|
||||
}
|
||||
}
|
||||
|
||||
final Map<String, String> res = new HashMap<>();
|
||||
for (final Document entry : getColl(db, COLL_METADATA, true).find()) {
|
||||
if (entry.getString("format").equals(mdFormat) && entry.getString("layout").equals(mdLayout)
|
||||
&& entry.getString("interpretation").equals(mdInterpretation) && transactions.containsKey(entry.getString("mdId"))) {
|
||||
res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId")));
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
private MongoDatabase getDb(final MongoClient client, final String dbName) {
|
||||
if (!Iterables.contains(client.listDatabaseNames(), dbName)) {
|
||||
final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress());
|
||||
log.warn(err);
|
||||
throw new RuntimeException(err);
|
||||
}
|
||||
return client.getDatabase(dbName);
|
||||
}
|
||||
|
||||
private MongoCollection<Document> getColl(final MongoDatabase db, final String collName, final boolean abortIfMissing) {
|
||||
if (!Iterables.contains(db.listCollectionNames(), collName)) {
|
||||
final String err = String.format(String.format("Missing collection '%s' in database '%s'", collName, db.getName()));
|
||||
log.warn(err);
|
||||
if (abortIfMissing) {
|
||||
throw new RuntimeException(err);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return db.getCollection(collName);
|
||||
}
|
||||
|
||||
public Iterable<String> listRecords(final String collName) {
|
||||
final MongoCollection<Document> coll = getColl(db, collName, false);
|
||||
return coll == null ? new ArrayList<>()
|
||||
: () -> StreamSupport.stream(coll.find().spliterator(), false)
|
||||
.filter(e -> e.containsKey("body"))
|
||||
.map(e -> e.getString("body"))
|
||||
.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
client.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,520 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.sql.Array;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class MigrateDbEntitiesApplication extends AbstractMigrationExecutor implements Closeable {
|
||||
|
||||
private static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
|
||||
qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenance_actions", "dnet:provenance_actions");
|
||||
|
||||
private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class);
|
||||
|
||||
private final DbClient dbClient;
|
||||
|
||||
private final long lastUpdateTimestamp;
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String dbUrl = parser.get("postgresUrl");
|
||||
final String dbUser = parser.get("postgresUser");
|
||||
final String dbPassword = parser.get("postgresPassword");
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
final String hdfsNameNode = parser.get("namenode");
|
||||
final String hdfsUser = parser.get("hdfsUser");
|
||||
|
||||
try (final MigrateDbEntitiesApplication smdbe = new MigrateDbEntitiesApplication(hdfsPath, hdfsNameNode, hdfsUser, dbUrl, dbUser, dbPassword)) {
|
||||
log.info("Processing datasources...");
|
||||
smdbe.execute("queryDatasources.sql", smdbe::processDatasource);
|
||||
|
||||
log.info("Processing projects...");
|
||||
smdbe.execute("queryProjects.sql", smdbe::processProject);
|
||||
|
||||
log.info("Processing orgs...");
|
||||
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);
|
||||
|
||||
log.info("Processing relations ds <-> orgs ...");
|
||||
smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization);
|
||||
|
||||
log.info("Processing projects <-> orgs ...");
|
||||
smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization);
|
||||
|
||||
log.info("All done.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Creates the application: the HDFS settings go to the parent executor, the database
 * settings open a {@link DbClient}, and the current time is frozen as the timestamp
 * stamped on everything emitted by this instance.
 */
public MigrateDbEntitiesApplication(
	final String hdfsPath,
	final String hdfsNameNode,
	final String hdfsUser,
	final String dbUrl,
	final String dbUser,
	final String dbPassword) throws Exception {

	super(hdfsPath, hdfsNameNode, hdfsUser);
	this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
	this.lastUpdateTimestamp = System.currentTimeMillis();
}
|
||||
|
||||
public void execute(final String sqlFile, final Consumer<ResultSet> consumer) throws Exception {
|
||||
final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile));
|
||||
dbClient.processResults(sql, consumer);
|
||||
}
|
||||
|
||||
public void processDatasource(final ResultSet rs) {
|
||||
|
||||
try {
|
||||
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
|
||||
final Datasource ds = new Datasource();
|
||||
|
||||
ds.setId(createOpenaireId(10, rs.getString("datasourceid")));
|
||||
ds.setOriginalId(Arrays.asList(rs.getString("datasourceid")));
|
||||
ds.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
|
||||
ds.setPid(new ArrayList<>());
|
||||
ds.setDateofcollection(asString(rs.getDate("dateofcollection")));
|
||||
ds.setDateoftransformation(null); // Value not returned by the SQL query
|
||||
ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB
|
||||
ds.setOaiprovenance(null); // Values not present in the DB
|
||||
ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype")));
|
||||
ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility")));
|
||||
ds.setOfficialname(field(rs.getString("officialname"), info));
|
||||
ds.setEnglishname(field(rs.getString("englishname"), info));
|
||||
ds.setWebsiteurl(field(rs.getString("websiteurl"), info));
|
||||
ds.setLogourl(field(rs.getString("logourl"), info));
|
||||
ds.setContactemail(field(rs.getString("contactemail"), info));
|
||||
ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info));
|
||||
ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info));
|
||||
ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info));
|
||||
ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info));
|
||||
ds.setDescription(field(rs.getString("description"), info));
|
||||
ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
|
||||
ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info));
|
||||
ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info));
|
||||
ds.setOdpolicies(field(rs.getString("odpolicies"), info));
|
||||
ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info));
|
||||
ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info));
|
||||
ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info));
|
||||
ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info));
|
||||
ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info));
|
||||
ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info));
|
||||
ds.setDataprovider(field(rs.getBoolean("dataprovider"), info));
|
||||
ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info));
|
||||
ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info));
|
||||
ds.setDatauploadtype(field(rs.getString("datauploadtype"), info));
|
||||
ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info));
|
||||
ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info));
|
||||
ds.setVersioning(field(rs.getBoolean("versioning"), info));
|
||||
ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info));
|
||||
ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info));
|
||||
ds.setPidsystems(field(rs.getString("pidsystems"), info));
|
||||
ds.setCertificates(field(rs.getString("certificates"), info));
|
||||
ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array
|
||||
ds.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal
|
||||
ds.setDataInfo(info);
|
||||
ds.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
// rs.getString("datasourceid");
|
||||
// rs.getArray("identities");
|
||||
// rs.getString("officialname");
|
||||
// rs.getString("englishname");
|
||||
// rs.getString("contactemail");
|
||||
// rs.getString("openairecompatibility"); // COMPLEX ...@@@...
|
||||
// rs.getString("websiteurl");
|
||||
// rs.getString("logourl");
|
||||
// rs.getArray("accessinfopackage");
|
||||
// rs.getDouble("latitude");
|
||||
// rs.getDouble("longitude");
|
||||
// rs.getString("namespaceprefix");
|
||||
// rs.getInt("odnumberofitems"); // NULL
|
||||
// rs.getDate("odnumberofitemsdate"); // NULL
|
||||
// rs.getArray("subjects");
|
||||
// rs.getString("description");
|
||||
// rs.getString("odpolicies"); // NULL
|
||||
// rs.getArray("odlanguages");
|
||||
// rs.getArray("odcontenttypes");
|
||||
// rs.getBoolean("inferred"); // false
|
||||
// rs.getBoolean("deletedbyinference");// false
|
||||
// rs.getDouble("trust"); // 0.9
|
||||
// rs.getString("inferenceprovenance"); // NULL
|
||||
// rs.getDate("dateofcollection");
|
||||
// rs.getDate("dateofvalidation");
|
||||
// rs.getDate("releasestartdate");
|
||||
// rs.getDate("releaseenddate");
|
||||
// rs.getString("missionstatementurl");
|
||||
// rs.getBoolean("dataprovider");
|
||||
// rs.getBoolean("serviceprovider");
|
||||
// rs.getString("databaseaccesstype");
|
||||
// rs.getString("datauploadtype");
|
||||
// rs.getString("databaseaccessrestriction");
|
||||
// rs.getString("datauploadrestriction");
|
||||
// rs.getBoolean("versioning");
|
||||
// rs.getString("citationguidelineurl");
|
||||
// rs.getString("qualitymanagementkind");
|
||||
// rs.getString("pidsystems");
|
||||
// rs.getString("certificates");
|
||||
// rs.getArray("policies");
|
||||
// rs.getString("collectedfromid");
|
||||
// rs.getString("collectedfromname");
|
||||
// rs.getString("datasourcetype"); // COMPLEX
|
||||
// rs.getString("provenanceaction"); //
|
||||
// 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions'
|
||||
// AS provenanceaction,
|
||||
// rs.getString("journal"); // CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal
|
||||
|
||||
emitOaf(ds);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public void processProject(final ResultSet rs) {
|
||||
try {
|
||||
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
|
||||
final Project p = new Project();
|
||||
|
||||
p.setId(createOpenaireId(40, rs.getString("projectid")));
|
||||
p.setOriginalId(Arrays.asList(rs.getString("projectid")));
|
||||
p.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
|
||||
p.setPid(new ArrayList<>());
|
||||
p.setDateofcollection(asString(rs.getDate("dateofcollection")));
|
||||
p.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
|
||||
p.setExtraInfo(new ArrayList<>()); // Values not present in the DB
|
||||
p.setOaiprovenance(null); // Values not present in the DB
|
||||
p.setWebsiteurl(field(rs.getString("websiteurl"), info));
|
||||
p.setCode(field(rs.getString("code"), info));
|
||||
p.setAcronym(field(rs.getString("acronym"), info));
|
||||
p.setTitle(field(rs.getString("title"), info));
|
||||
p.setStartdate(field(asString(rs.getDate("startdate")), info));
|
||||
p.setEnddate(field(asString(rs.getDate("enddate")), info));
|
||||
p.setCallidentifier(field(rs.getString("callidentifier"), info));
|
||||
p.setKeywords(field(rs.getString("keywords"), info));
|
||||
p.setDuration(field(Integer.toString(rs.getInt("duration")), info));
|
||||
p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info));
|
||||
p.setOamandatepublications(field(Boolean.toString(rs.getBoolean("oamandatepublications")), info));
|
||||
p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info));
|
||||
p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info));
|
||||
p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info));
|
||||
p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype")));
|
||||
p.setOptional1(field(rs.getString("optional1"), info));
|
||||
p.setOptional2(field(rs.getString("optional2"), info));
|
||||
p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info));
|
||||
p.setContactfullname(field(rs.getString("contactfullname"), info));
|
||||
p.setContactfax(field(rs.getString("contactfax"), info));
|
||||
p.setContactphone(field(rs.getString("contactphone"), info));
|
||||
p.setContactemail(field(rs.getString("contactemail"), info));
|
||||
p.setSummary(field(rs.getString("summary"), info));
|
||||
p.setCurrency(field(rs.getString("currency"), info));
|
||||
p.setTotalcost(new Float(rs.getDouble("totalcost")));
|
||||
p.setFundedamount(new Float(rs.getDouble("fundedamount")));
|
||||
p.setDataInfo(info);
|
||||
p.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
// rs.getString("projectid");
|
||||
// rs.getString("code");
|
||||
// rs.getString("websiteurl");
|
||||
// rs.getString("acronym");
|
||||
// rs.getString("title");
|
||||
// rs.getDate("startdate");
|
||||
// rs.getDate("enddate");
|
||||
// rs.getString("callidentifier");
|
||||
// rs.getString("keywords");
|
||||
// rs.getInt("duration");
|
||||
// rs.getBoolean("ecsc39");
|
||||
// rs.getBoolean("oamandatepublications");
|
||||
// rs.getBoolean("ecarticle29_3");
|
||||
// rs.getDate("dateofcollection");
|
||||
// rs.getDate("dateoftransformation");
|
||||
// rs.getBoolean("inferred");
|
||||
// rs.getBoolean("deletedbyinference");
|
||||
// rs.getDouble("trust");
|
||||
// rs.getString("inferenceprovenance");
|
||||
// rs.getString("optional1");
|
||||
// rs.getString("optional2");
|
||||
// rs.getString("jsonextrainfo");
|
||||
// rs.getString("contactfullname");
|
||||
// rs.getString("contactfax");
|
||||
// rs.getString("contactphone");
|
||||
// rs.getString("contactemail");
|
||||
// rs.getString("summary");
|
||||
// rs.getString("currency");
|
||||
// rs.getDouble("totalcost");
|
||||
// rs.getDouble("fundedamount");
|
||||
// rs.getString("collectedfromid");
|
||||
// rs.getString("collectedfromname");
|
||||
// rs.getString("contracttype"); // COMPLEX
|
||||
// rs.getString("provenanceaction"); // COMPLEX
|
||||
// rs.getArray("pid");
|
||||
// rs.getArray("subjects");
|
||||
// rs.getArray("fundingtree");
|
||||
|
||||
emitOaf(p);
|
||||
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public void processOrganization(final ResultSet rs) {
|
||||
|
||||
try {
|
||||
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
|
||||
final Organization o = new Organization();
|
||||
|
||||
o.setId(createOpenaireId(20, rs.getString("organizationid")));
|
||||
o.setOriginalId(Arrays.asList(rs.getString("organizationid")));
|
||||
o.setCollectedfrom(listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname")));
|
||||
o.setPid(new ArrayList<>());
|
||||
o.setDateofcollection(asString(rs.getDate("dateofcollection")));
|
||||
o.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
|
||||
o.setExtraInfo(new ArrayList<>()); // Values not present in the DB
|
||||
o.setOaiprovenance(null); // Values not present in the DB
|
||||
o.setLegalshortname(field("legalshortname", info));
|
||||
o.setLegalname(field("legalname", info));
|
||||
o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query
|
||||
o.setWebsiteurl(field("websiteurl", info));
|
||||
o.setLogourl(field("logourl", info));
|
||||
o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info));
|
||||
o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info));
|
||||
o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info));
|
||||
o.setEcresearchorganization(field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info));
|
||||
o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info));
|
||||
o.setEcinternationalorganizationeurinterests(field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info));
|
||||
o.setEcinternationalorganization(field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info));
|
||||
o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info));
|
||||
o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info));
|
||||
o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info));
|
||||
o.setCountry(prepareQualifierSplitting(rs.getString("country")));
|
||||
o.setDataInfo(info);
|
||||
o.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
// rs.getString("organizationid");
|
||||
// rs.getString("legalshortname");
|
||||
// rs.getString("legalname");
|
||||
// rs.getString("websiteurl");
|
||||
// rs.getString("logourl");
|
||||
// rs.getBoolean("eclegalbody");
|
||||
// rs.getBoolean("eclegalperson");
|
||||
// rs.getBoolean("ecnonprofit");
|
||||
// rs.getBoolean("ecresearchorganization");
|
||||
// rs.getBoolean("echighereducation");
|
||||
// rs.getBoolean("ecinternationalorganizationeurinterests");
|
||||
// rs.getBoolean("ecinternationalorganization");
|
||||
// rs.getBoolean("ecenterprise");
|
||||
// rs.getBoolean("ecsmevalidated");
|
||||
// rs.getBoolean("ecnutscode");
|
||||
// rs.getDate("dateofcollection");
|
||||
// rs.getDate("dateoftransformation");
|
||||
// rs.getBoolean("inferred");
|
||||
// rs.getBoolean("deletedbyinference");
|
||||
// rs.getDouble("trust");
|
||||
// rs.getString("inferenceprovenance");
|
||||
// rs.getString("collectedfromid");
|
||||
// rs.getString("collectedfromname");
|
||||
// rs.getString("country");
|
||||
// rs.getString("provenanceaction");
|
||||
// rs.getArray("pid");
|
||||
|
||||
emitOaf(o);
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public void processDatasourceOrganization(final ResultSet rs) {
|
||||
|
||||
try {
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
final String orgId = createOpenaireId(20, rs.getString("organization"));
|
||||
final String dsId = createOpenaireId(10, rs.getString("datasource"));
|
||||
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
r1.setRelType("datasourceOrganization");
|
||||
r1.setSubRelType("provision");
|
||||
r1.setRelClass("isProvidedBy");
|
||||
r1.setSource(dsId);
|
||||
r1.setTarget(orgId);
|
||||
r1.setCollectedFrom(collectedFrom);
|
||||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
emitOaf(r1);
|
||||
|
||||
final Relation r2 = new Relation();
|
||||
r2.setRelType("datasourceOrganization");
|
||||
r2.setSubRelType("provision");
|
||||
r2.setRelClass("provides");
|
||||
r2.setSource(orgId);
|
||||
r2.setTarget(dsId);
|
||||
r2.setCollectedFrom(collectedFrom);
|
||||
r2.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
emitOaf(r2);
|
||||
|
||||
// rs.getString("datasource");
|
||||
// rs.getString("organization");
|
||||
// rs.getDate("startdate"); // NULL
|
||||
// rs.getDate("enddate"); // NULL
|
||||
// rs.getBoolean("inferred"); // false
|
||||
// rs.getBoolean("deletedbyinference"); // false
|
||||
// rs.getDouble("trust"); // 0.9
|
||||
// rs.getString("inferenceprovenance"); // NULL
|
||||
// rs.getString("semantics"); // 'providedBy@@@provided
|
||||
// by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS
|
||||
// semantics,
|
||||
// rs.getString("provenanceaction"); // d.provenanceaction || '@@@' || d.provenanceaction ||
|
||||
// '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction
|
||||
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public void processProjectOrganization(final ResultSet rs) {
|
||||
|
||||
try {
|
||||
final DataInfo info = prepareDataInfo(rs);
|
||||
final String orgId = createOpenaireId(20, rs.getString("resporganization"));
|
||||
final String projectId = createOpenaireId(40, rs.getString("project"));
|
||||
final List<KeyValue> collectedFrom = listKeyValues(rs.getString("collectedfromid"), rs.getString("collectedfromname"));
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
r1.setRelType("projectOrganization");
|
||||
r1.setSubRelType("participation");
|
||||
r1.setRelClass("isParticipant");
|
||||
r1.setSource(projectId);
|
||||
r1.setTarget(orgId);
|
||||
r1.setCollectedFrom(collectedFrom);
|
||||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
emitOaf(r1);
|
||||
|
||||
final Relation r2 = new Relation();
|
||||
r2.setRelType("projectOrganization");
|
||||
r2.setSubRelType("participation");
|
||||
r2.setRelClass("hasParticipant");
|
||||
r2.setSource(orgId);
|
||||
r2.setTarget(projectId);
|
||||
r2.setCollectedFrom(collectedFrom);
|
||||
r2.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
emitOaf(r2);
|
||||
|
||||
// rs.getString("project");
|
||||
// rs.getString("resporganization");
|
||||
// rs.getInt("participantnumber");
|
||||
// rs.getDouble("contribution");
|
||||
// rs.getDate("startdate");// null
|
||||
// rs.getDate("enddate");// null
|
||||
// rs.getBoolean("inferred");// false
|
||||
// rs.getBoolean("deletedbyinference"); // false
|
||||
// rs.getDouble("trust");
|
||||
// rs.getString("inferenceprovenance"); // NULL
|
||||
// rs.getString("semantics"); // po.semanticclass || '@@@' || po.semanticclass ||
|
||||
// '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics,
|
||||
// rs.getString("provenanceaction"); //
|
||||
// 'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions'
|
||||
// AS provenanceaction
|
||||
|
||||
} catch (final Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException {
|
||||
final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
|
||||
final String inferenceprovenance = rs.getString("inferenceprovenance");
|
||||
final Boolean inferred = rs.getBoolean("inferred");
|
||||
final String trust = rs.getString("trust");
|
||||
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, ENTITYREGISTRY_PROVENANCE_ACTION, trust);
|
||||
}
|
||||
|
||||
private Qualifier prepareQualifierSplitting(final String s) {
|
||||
if (StringUtils.isBlank(s)) { return null; }
|
||||
final String[] arr = s.split("@@@");
|
||||
return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null;
|
||||
}
|
||||
|
||||
private List<Field<String>> prepareListFields(final Array array, final DataInfo info) {
|
||||
try {
|
||||
return listFields(info, (String[]) array.getArray());
|
||||
} catch (final SQLException e) {
|
||||
throw new RuntimeException("Invalid SQL array", e);
|
||||
}
|
||||
}
|
||||
|
||||
private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) {
|
||||
if (StringUtils.isBlank(s)) { return null; }
|
||||
final String[] parts = s.split("###");
|
||||
if (parts.length == 2) {
|
||||
final String value = parts[0];
|
||||
final String[] arr = parts[1].split("@@@");
|
||||
if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); }
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private List<StructuredProperty> prepareListOfStructProps(final Array array, final DataInfo dataInfo) throws SQLException {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
if (array != null) {
|
||||
for (final String s : (String[]) array.getArray()) {
|
||||
final StructuredProperty sp = prepareStructProp(s, dataInfo);
|
||||
if (sp != null) {
|
||||
res.add(sp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
private Journal prepareJournal(final String name, final String sj, final DataInfo info) {
|
||||
if (StringUtils.isNotBlank(sj)) {
|
||||
final String[] arr = sj.split("@@@");
|
||||
if (arr.length == 3) {
|
||||
final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null;
|
||||
final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;;
|
||||
final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;;
|
||||
if (issn != null || eissn != null
|
||||
|| lissn != null) { return journal(name, issn, eissn, eissn, null, null, null, null, null, null, null, info); }
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
dbClient.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class MigrateMongoMdstoresApplication {
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String mongoBaseUrl = parser.get("mongoBaseUrl");
|
||||
final String mongoDb = parser.get("mongoDb");
|
||||
|
||||
final String mdFormat = parser.get("mdFormat");
|
||||
final String mdLayout = parser.get("mdLayout");
|
||||
final String mdInterpretation = parser.get("mdInterpretation");
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
final String hdfsNameNode = parser.get("namenode");
|
||||
final String hdfsUser = parser.get("hdfsUser");
|
||||
|
||||
final String dbUrl = parser.get("postgresUrl");
|
||||
final String dbUser = parser.get("postgresUser");
|
||||
final String dbPassword = parser.get("postgresPassword");
|
||||
|
||||
if (mdFormat.equalsIgnoreCase("oaf")) {
|
||||
try (final OafMigrationExecutor mig =
|
||||
new OafMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
|
||||
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
|
||||
}
|
||||
} else if (mdFormat.equalsIgnoreCase("odf")) {
|
||||
try (final OdfMigrationExecutor mig =
|
||||
new OdfMigrationExecutor(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword)) {
|
||||
mig.processMdRecords(mdFormat, mdLayout, mdInterpretation);
|
||||
}
|
||||
} else {
|
||||
throw new RuntimeException("Format not supported: " + mdFormat);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,251 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.migration.pace.PacePerson;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class OafMigrationExecutor extends AbstractMongoExecutor {
|
||||
|
||||
private static final Log log = LogFactory.getLog(OafMigrationExecutor.class);
|
||||
|
||||
/**
 * Creates the executor; every argument is forwarded unchanged to the parent constructor.
 */
public OafMigrationExecutor(
	final String hdfsPath,
	final String hdfsNameNode,
	final String hdfsUser,
	final String mongoBaseUrl,
	final String mongoDb,
	final String dbUrl,
	final String dbUser,
	final String dbPassword) throws Exception {

	super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
}
|
||||
|
||||
@Override
|
||||
protected void registerNamespaces(final Map<String, String> nsContext) {
|
||||
super.registerNamespaces(nsContext);
|
||||
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
|
||||
final List<Author> res = new ArrayList<>();
|
||||
int pos = 1;
|
||||
for (final Object o : doc.selectNodes("//dc:creator")) {
|
||||
final Node n = (Node) o;
|
||||
final Author author = new Author();
|
||||
author.setFullname(n.getText());
|
||||
author.setRank(pos++);
|
||||
final PacePerson p = new PacePerson(n.getText(), false);
|
||||
if (p.isAccurate()) {
|
||||
author.setName(p.getNormalisedFirstName());
|
||||
author.setSurname(p.getNormalisedSurname());
|
||||
}
|
||||
res.add(author);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareLanguages(final Document doc) {
|
||||
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//dc:subject", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:description", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:publisher", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:format", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:contributor", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:coverage", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
|
||||
final List<Instance> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("//dc:identifier")) {
|
||||
final String url = ((Node) o).getText().trim();
|
||||
if (url.startsWith("http")) {
|
||||
final Instance instance = new Instance();
|
||||
instance.setUrl(Arrays.asList(url));
|
||||
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
|
||||
instance.setCollectedfrom(collectedfrom);
|
||||
instance.setHostedby(hostedby);
|
||||
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
||||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
|
||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
res.add(instance);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:source", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
// SOFTWARES
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
// DATASETS
|
||||
@Override
|
||||
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
// OTHER PRODUCTS
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Oaf> addOtherResultRels(final Document doc,
|
||||
final KeyValue collectedFrom,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp) {
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
|
||||
|
||||
final List<Oaf> res = new ArrayList<>();
|
||||
|
||||
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
|
||||
final String otherId = createOpenaireId(50, ((Node) o).getText());
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
r1.setRelType("resultResult");
|
||||
r1.setSubRelType("publicationDataset");
|
||||
r1.setRelClass("isRelatedTo");
|
||||
r1.setSource(docId);
|
||||
r1.setTarget(otherId);
|
||||
r1.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
res.add(r1);
|
||||
|
||||
final Relation r2 = new Relation();
|
||||
r2.setRelType("resultResult");
|
||||
r2.setSubRelType("publicationDataset");
|
||||
r2.setRelClass("isRelatedTo");
|
||||
r2.setSource(otherId);
|
||||
r2.setTarget(docId);
|
||||
r2.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||
r2.setDataInfo(info);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
res.add(r2);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
||||
return null; // NOT PRESENT IN OAF
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,273 @@
|
|||
package eu.dnetlib.dhp.migration;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
||||
|
||||
private static final Log log = LogFactory.getLog(OdfMigrationExecutor.class);
|
||||
|
||||
public OdfMigrationExecutor(final String hdfsPath, final String hdfsNameNode, final String hdfsUser, final String mongoBaseUrl, final String mongoDb,
|
||||
final String dbUrl, final String dbUser,
|
||||
final String dbPassword) throws Exception {
|
||||
super(hdfsPath, hdfsNameNode, hdfsUser, mongoBaseUrl, mongoDb, dbUrl, dbUser, dbPassword);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void registerNamespaces(final Map<String, String> nsContext) {
|
||||
super.registerNamespaces(nsContext);
|
||||
nsContext.put("dc", "http://datacite.org/schema/kernel-3");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
|
||||
final List<Author> res = new ArrayList<>();
|
||||
int pos = 1;
|
||||
for (final Object o : doc.selectNodes("//dc:creator")) {
|
||||
final Node n = (Node) o;
|
||||
final Author author = new Author();
|
||||
author.setFullname(n.valueOf("./dc:creatorName"));
|
||||
author.setName(n.valueOf("./dc:givenName"));
|
||||
author.setSurname(n.valueOf("./dc:familyName"));
|
||||
author.setAffiliation(prepareListFields(doc, "./dc:affiliation", info));
|
||||
author.setPid(preparePids(doc, info));
|
||||
author.setRank(pos++);
|
||||
res.add(author);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private List<StructuredProperty> preparePids(final Document doc, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("./dc:nameIdentifier")) {
|
||||
res.add(structuredProperty(((Node) o).getText(), prepareQualifier((Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), info));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Instance> prepareInstances(final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) {
|
||||
final List<Instance> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("//dc:alternateIdentifier[@alternateIdentifierType='URL']")) {
|
||||
final Instance instance = new Instance();
|
||||
instance.setUrl(Arrays.asList(((Node) o).getText().trim()));
|
||||
instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource"));
|
||||
instance.setCollectedfrom(collectedfrom);
|
||||
instance.setHostedby(hostedby);
|
||||
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
||||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||
instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes"));
|
||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||
instance.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
res.add(instance);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
||||
final List<StructuredProperty> res = new ArrayList<>();
|
||||
for (final Object o : doc.selectNodes("//dc:date")) {
|
||||
final String dateType = ((Node) o).valueOf("@dateType");
|
||||
if (StringUtils.isBlank(dateType) && !dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued")
|
||||
&& !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available")) {
|
||||
res.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", "dnet:dataCite_date", "dnet:dataCite_date", info));
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:format", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:publisher", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:description[@descriptionType='Abstract']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
||||
return prepareListStructProps(doc, "//dc:subject", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareLanguages(final Document doc) {
|
||||
return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactGroup']/dc:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:contributor[@contributorType='ContactPerson']/dc:contributorName", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
||||
return prepareQualifier(doc, "//dc:format", "dnet:programming_languages", "dnet:programming_languages");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
|
||||
return null; // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
|
||||
return new ArrayList<>(); // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
|
||||
return prepareListFields(doc, "//dc:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
|
||||
}
|
||||
|
||||
// DATASETS
|
||||
|
||||
@Override
|
||||
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
|
||||
final List<GeoLocation> res = new ArrayList<>();
|
||||
|
||||
for (final Object o : doc.selectNodes("//dc:geoLocation")) {
|
||||
final GeoLocation loc = new GeoLocation();
|
||||
loc.setBox(((Node) o).valueOf("./dc:geoLocationBox"));
|
||||
loc.setPlace(((Node) o).valueOf("./dc:geoLocationPlace"));
|
||||
loc.setPoint(((Node) o).valueOf("./dc:geoLocationPoint"));
|
||||
res.add(loc);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
|
||||
return null; // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:date[@dateType='Updated']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:version", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:size", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
|
||||
return null; // Not present in ODF ???
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
|
||||
return prepareField(doc, "//dc:date[@dateType='Issued']", info);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Oaf> addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
|
||||
|
||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
|
||||
|
||||
final List<Oaf> res = new ArrayList<>();
|
||||
|
||||
for (final Object o : doc.selectNodes("//*[local-name() = 'resource']//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) {
|
||||
final String otherId = createOpenaireId(50, ((Node) o).getText());
|
||||
final String type = ((Node) o).valueOf("@relationType");
|
||||
|
||||
if (type.equals("IsSupplementTo")) {
|
||||
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "isSupplementTo"));
|
||||
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "isSupplementedBy"));
|
||||
} else if (type.equals("IsPartOf")) {
|
||||
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf"));
|
||||
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts"));
|
||||
} else {}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private Relation prepareOtherResultRel(final KeyValue collectedFrom,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp,
|
||||
final String source,
|
||||
final String target,
|
||||
final String subRelType,
|
||||
final String relClass) {
|
||||
final Relation r = new Relation();
|
||||
r.setRelType("resultResult");
|
||||
r.setSubRelType(subRelType);
|
||||
r.setRelClass(relClass);
|
||||
r.setSource(source);
|
||||
r.setTarget(target);
|
||||
r.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||
r.setDataInfo(info);
|
||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
return r;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
||||
return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,176 @@
|
|||
package eu.dnetlib.dhp.migration.pace;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.Normalizer;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
public class PacePerson {
|
||||
|
||||
private static final String UTF8 = "UTF-8";
|
||||
private List<String> name = Lists.newArrayList();
|
||||
private List<String> surname = Lists.newArrayList();
|
||||
private List<String> fullname = Lists.newArrayList();
|
||||
private final String original;
|
||||
|
||||
private static Set<String> particles = null;
|
||||
|
||||
public static final String capitalize(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
|
||||
}
|
||||
|
||||
public static final String dotAbbreviations(final String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
final Set<String> h = new HashSet<>();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
|
||||
h.add(s);
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return new HashSet<>();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
public PacePerson(String s, final boolean aggressive) {
|
||||
original = s;
|
||||
s = Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
s = s.replaceAll("\\(.+\\)", "");
|
||||
s = s.replaceAll("\\[.+\\]", "");
|
||||
s = s.replaceAll("\\{.+\\}", "");
|
||||
s = s.replaceAll("\\s+-\\s+", "-");
|
||||
s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
|
||||
s = s.replaceAll("\\d", " ");
|
||||
s = s.replaceAll("\\n", " ");
|
||||
s = s.replaceAll("\\.", " ");
|
||||
s = s.replaceAll("\\s+", " ");
|
||||
|
||||
if (aggressive) {
|
||||
s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
|
||||
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
||||
}
|
||||
|
||||
if (s.contains(",")) {
|
||||
final String[] arr = s.split(",");
|
||||
if (arr.length == 1) {
|
||||
fullname = splitTerms(arr[0]);
|
||||
} else if (arr.length > 1) {
|
||||
surname = splitTerms(arr[0]);
|
||||
name = splitTerms(arr[1]);
|
||||
fullname.addAll(surname);
|
||||
fullname.addAll(name);
|
||||
}
|
||||
} else {
|
||||
fullname = splitTerms(s);
|
||||
|
||||
int lastInitialPosition = fullname.size();
|
||||
boolean hasSurnameInUpperCase = false;
|
||||
|
||||
for (int i = 0; i < fullname.size(); i++) {
|
||||
final String term = fullname.get(i);
|
||||
if (term.length() == 1) {
|
||||
lastInitialPosition = i;
|
||||
} else if (term.equals(term.toUpperCase())) {
|
||||
hasSurnameInUpperCase = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
|
||||
name = fullname.subList(0, lastInitialPosition + 1);
|
||||
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
|
||||
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
|
||||
for (final String term : fullname) {
|
||||
if (term.length() > 1 && term.equals(term.toUpperCase())) {
|
||||
surname.add(term);
|
||||
} else {
|
||||
name.add(term);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> splitTerms(final String s) {
|
||||
if (particles == null) {
|
||||
particles = loadFromClasspath("/eu/dnetlib/dhp/migration/pace/name_particles.txt");
|
||||
}
|
||||
|
||||
final List<String> list = Lists.newArrayList();
|
||||
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
|
||||
if (!particles.contains(part.toLowerCase())) {
|
||||
list.add(part);
|
||||
}
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
public List<String> getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public String getNameString() {
|
||||
return Joiner.on(" ").join(getName());
|
||||
}
|
||||
|
||||
public List<String> getSurname() {
|
||||
return surname;
|
||||
}
|
||||
|
||||
public List<String> getFullname() {
|
||||
return fullname;
|
||||
}
|
||||
|
||||
public String getOriginal() {
|
||||
return original;
|
||||
}
|
||||
|
||||
public String hash() {
|
||||
return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
|
||||
}
|
||||
|
||||
public String getNormalisedFirstName() {
|
||||
return Joiner.on(" ").join(getCapitalFirstnames());
|
||||
}
|
||||
|
||||
public String getNormalisedSurname() {
|
||||
return Joiner.on(" ").join(getCapitalSurname());
|
||||
}
|
||||
|
||||
public String getSurnameString() {
|
||||
return Joiner.on(" ").join(getSurname());
|
||||
}
|
||||
|
||||
public String getNormalisedFullname() {
|
||||
return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
|
||||
}
|
||||
|
||||
public List<String> getCapitalFirstnames() {
|
||||
return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
|
||||
}
|
||||
|
||||
public List<String> getCapitalSurname() {
|
||||
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
|
||||
}
|
||||
|
||||
public List<String> getNameWithAbbreviations() {
|
||||
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
|
||||
}
|
||||
|
||||
public boolean isAccurate() {
|
||||
return name != null && surname != null && !name.isEmpty() && !surname.isEmpty();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
[
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the HDFS source path which contains the sequential file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "g",
|
||||
"paramLongName": "graphRawPath",
|
||||
"paramDescription": "the path of the graph Raw in hdfs",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "e",
|
||||
"paramLongName": "entity",
|
||||
"paramDescription": "The entity to extract",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,38 @@
|
|||
[
|
||||
{
|
||||
"paramName": "p",
|
||||
"paramLongName": "hdfsPath",
|
||||
"paramDescription": "the path where the sequential file is stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "namenode",
|
||||
"paramDescription": "the Name Node URI",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "u",
|
||||
"paramLongName": "hdfsUser",
|
||||
"paramDescription": "the user which creates the hdfs seq file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "dburl",
|
||||
"paramLongName": "postgresUrl",
|
||||
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "dbuser",
|
||||
"paramLongName": "postgresUser",
|
||||
"paramDescription": "postgres user",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "dbpasswd",
|
||||
"paramLongName": "postgresPassword",
|
||||
"paramDescription": "postgres password",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -0,0 +1,68 @@
|
|||
[
|
||||
{
|
||||
"paramName": "p",
|
||||
"paramLongName": "hdfsPath",
|
||||
"paramDescription": "the path where the sequential file is stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "namenode",
|
||||
"paramDescription": "the Name Node URI",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "u",
|
||||
"paramLongName": "hdfsUser",
|
||||
"paramDescription": "the user which creates the hdfs seq file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mongourl",
|
||||
"paramLongName": "mongoBaseUrl",
|
||||
"paramDescription": "mongoDB url, example: mongodb://[username:password@]host[:port]",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "db",
|
||||
"paramLongName": "mongoDb",
|
||||
"paramDescription": "mongo database",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "f",
|
||||
"paramLongName": "mdFormat",
|
||||
"paramDescription": "metadata format",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "l",
|
||||
"paramLongName": "mdLayout",
|
||||
"paramDescription": "metadata layout",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "mdInterpretation",
|
||||
"paramDescription": "metadata interpretation",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pgurl",
|
||||
"paramLongName": "postgresUrl",
|
||||
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pguser",
|
||||
"paramLongName": "postgresUser",
|
||||
"paramDescription": "postgres user",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "pgpasswd",
|
||||
"paramLongName": "postgresPassword",
|
||||
"paramDescription": "postgres password",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -0,0 +1,22 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hdfsUser</name>
|
||||
<value>dnet</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,282 @@
|
|||
<workflow-app name="import Entities from aggregator to HDFS" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the base HDFS path under which the imported entities are stored (deleted and recreated at workflow start)</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>graphRawPath</name>
|
||||
<description>the graph Raw base path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL used to access the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the postgres user</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the postgres password</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongourl</name>
|
||||
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoDb</name>
|
||||
<description>mongo database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath}'/>
|
||||
<mkdir path='${workingPath}'/>
|
||||
</fs>
|
||||
<ok to="ImportEntitiesFromPostgres"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportEntitiesFromPostgres">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/db_entities</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-u</arg><arg>${hdfsUser}</arg>
|
||||
<arg>-dburl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-dbuser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-dbpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ImportODFEntitiesFromMongoDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportODFEntitiesFromMongoDB">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/odf_entities</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-u</arg><arg>${hdfsUser}</arg>
|
||||
<arg>-mongourl</arg><arg>${mongourl}</arg>
|
||||
<arg>-db</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>ODF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ImportOAFEntitiesFromMongoDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAFEntitiesFromMongoDB">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.migration.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/oaf_entities</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-u</arg><arg>${hdfsUser}</arg>
|
||||
<arg>-mongourl</arg><arg>${mongourl}</arg>
|
||||
<arg>-db</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>OAF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ExtractPublication"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractPublication">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: publication</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/publication</arg>
|
||||
<arg>-e</arg><arg>publication</arg>
|
||||
</spark>
|
||||
<ok to="ExtractDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractDataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: dataset</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/dataset</arg>
|
||||
<arg>-e</arg><arg>dataset</arg>
|
||||
</spark>
|
||||
<ok to="ExtractSoftware"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractSoftware">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: software</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/software</arg>
|
||||
<arg>-e</arg><arg>software</arg>
|
||||
</spark>
|
||||
<ok to="ExtractORP"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractORP">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: otherresearchproduct</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/otherresearchproduct</arg>
|
||||
<arg>-e</arg><arg>otherresearchproduct</arg>
|
||||
</spark>
|
||||
<ok to="ExtractDatasource"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractDatasource">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: datasource</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/datasource</arg>
|
||||
<arg>-e</arg><arg>datasource</arg>
|
||||
</spark>
|
||||
<ok to="ExtractOrganization"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractOrganization">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: organization</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/organization</arg>
|
||||
<arg>-e</arg><arg>organization</arg>
|
||||
</spark>
|
||||
<ok to="ExtractProject"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractProject">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: project</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/project</arg>
|
||||
<arg>-e</arg><arg>project</arg>
|
||||
</spark>
|
||||
<ok to="ExtractRelation"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ExtractRelation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractEntities: relation</name>
|
||||
<class>eu.dnetlib.dhp.migration.ExtractEntitiesFromHDFSJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf spark.sql.warehouse.dir="/user/hive/warehouse"</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>-g</arg><arg>${graphRawPath}/relation</arg>
|
||||
<arg>-e</arg><arg>relation</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,7 @@
|
|||
van
|
||||
der
|
||||
de
|
||||
dell
|
||||
sig
|
||||
mr
|
||||
mrs
|
|
@ -0,0 +1,17 @@
|
|||
SELECT
|
||||
dor.datasource AS datasource,
|
||||
dor.organization AS organization,
|
||||
NULL AS startdate,
|
||||
NULL AS enddate,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.9 AS trust,
|
||||
NULL AS inferenceprovenance,
|
||||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
'providedBy@@@provided by@@@dnet:datasources_organizations_typologies@@@dnet:datasources_organizations_typologies' AS semantics,
|
||||
d.provenanceaction || '@@@' || d.provenanceaction || '@@@dnet:provenanceActions@@@dnet:provenanceActions' AS provenanceaction
|
||||
|
||||
FROM dsm_datasource_organization dor
|
||||
LEFT OUTER JOIN dsm_datasources d ON (dor.datasource = d.id)
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = d.collectedfrom)
|
|
@ -0,0 +1,147 @@
|
|||
SELECT
|
||||
d.id AS datasourceid,
|
||||
d.id || array_agg(distinct di.pid) AS identities,
|
||||
d.officialname AS officialname,
|
||||
d.englishname AS englishname,
|
||||
d.contactemail AS contactemail,
|
||||
CASE
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['openaire-cris_1.1'])
|
||||
THEN
|
||||
'openaire-cris_1.1@@@OpenAIRE CRIS v1.1@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility):: TEXT) @> ARRAY ['driver', 'openaire2.0'])
|
||||
THEN
|
||||
'driver-openaire2.0@@@OpenAIRE 2.0+ (DRIVER OA, EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['driver'])
|
||||
THEN
|
||||
'driver@@@OpenAIRE Basic (DRIVER OA)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0'])
|
||||
THEN
|
||||
'openaire2.0@@@OpenAIRE 2.0 (EC funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire3.0'])
|
||||
THEN
|
||||
'openaire3.0@@@OpenAIRE 3.0 (OA, funding)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['openaire2.0_data'])
|
||||
THEN
|
||||
'openaire2.0_data@@@OpenAIRE Data (funded, referenced datasets)@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['native'])
|
||||
THEN
|
||||
'native@@@proprietary@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['hostedBy'])
|
||||
THEN
|
||||
'hostedBy@@@collected from a compatible aggregator@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
WHEN (array_agg(DISTINCT COALESCE (a.compatibility_override, a.compatibility) :: TEXT) @> ARRAY ['notCompatible'])
|
||||
THEN
|
||||
'notCompatible@@@under validation@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
ELSE
|
||||
'UNKNOWN@@@not available@@@dnet:datasourceCompatibilityLevel@@@dnet:datasourceCompatibilityLevel'
|
||||
END AS openairecompatibility,
|
||||
d.websiteurl AS websiteurl,
|
||||
d.logourl AS logourl,
|
||||
array_agg(DISTINCT CASE WHEN a.protocol = 'oai' and last_aggregation_date is not null THEN a.baseurl ELSE NULL END) AS accessinfopackage,
|
||||
d.latitude AS latitude,
|
||||
d.longitude AS longitude,
|
||||
d.namespaceprefix AS namespaceprefix,
|
||||
NULL AS odnumberofitems,
|
||||
NULL AS odnumberofitemsdate,
|
||||
|
||||
(SELECT array_agg(s|| '###keywords@@@keywords@@@dnet:subject_classification_typologies@@@dnet:subject_classification_typologies')
|
||||
FROM UNNEST(
|
||||
ARRAY(
|
||||
SELECT trim(s)
|
||||
FROM unnest(string_to_array(d.subjects, '@@')) AS s)) AS s) AS subjects,
|
||||
|
||||
d.description AS description,
|
||||
NULL AS odpolicies,
|
||||
ARRAY(SELECT trim(s)
|
||||
FROM unnest(string_to_array(d.languages, ',')) AS s) AS odlanguages,
|
||||
ARRAY(SELECT trim(s)
|
||||
FROM unnest(string_to_array(d.od_contenttypes, '-')) AS s) AS odcontenttypes,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.9 AS trust,
|
||||
NULL AS inferenceprovenance,
|
||||
d.dateofcollection AS dateofcollection,
|
||||
d.dateofvalidation AS dateofvalidation,
|
||||
-- re3data fields
|
||||
d.releasestartdate AS releasestartdate,
|
||||
d.releaseenddate AS releaseenddate,
|
||||
d.missionstatementurl AS missionstatementurl,
|
||||
d.dataprovider AS dataprovider,
|
||||
d.serviceprovider AS serviceprovider,
|
||||
d.databaseaccesstype AS databaseaccesstype,
|
||||
d.datauploadtype AS datauploadtype,
|
||||
d.databaseaccessrestriction AS databaseaccessrestriction,
|
||||
d.datauploadrestriction AS datauploadrestriction,
|
||||
d.versioning AS versioning,
|
||||
d.citationguidelineurl AS citationguidelineurl,
|
||||
d.qualitymanagementkind AS qualitymanagementkind,
|
||||
d.pidsystems AS pidsystems,
|
||||
d.certificates AS certificates,
|
||||
ARRAY[]::text[] AS policies,
|
||||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
d.typology || '@@@' || CASE
|
||||
WHEN (d.typology = 'crissystem') THEN 'CRIS System'
|
||||
WHEN (d.typology = 'datarepository::unknown') THEN 'Data Repository'
|
||||
WHEN (d.typology = 'aggregator::datarepository') THEN 'Data Repository Aggregator'
|
||||
WHEN (d.typology = 'infospace') THEN 'Information Space'
|
||||
WHEN (d.typology = 'pubsrepository::institutional') THEN 'Institutional Repository'
|
||||
WHEN (d.typology = 'aggregator::pubsrepository::institutional') THEN 'Institutional Repository Aggregator'
|
||||
WHEN (d.typology = 'pubsrepository::journal') THEN 'Journal'
|
||||
WHEN (d.typology = 'aggregator::pubsrepository::journals') THEN 'Journal Aggregator/Publisher'
|
||||
WHEN (d.typology = 'pubsrepository::mock') THEN 'Other'
|
||||
WHEN (d.typology = 'pubscatalogue::unknown') THEN 'Publication Catalogue'
|
||||
WHEN (d.typology = 'pubsrepository::unknown') THEN 'Publication Repository'
|
||||
WHEN (d.typology = 'aggregator::pubsrepository::unknown') THEN 'Publication Repository Aggregator'
|
||||
WHEN (d.typology = 'entityregistry') THEN 'Registry'
|
||||
WHEN (d.typology = 'scholarcomminfra') THEN 'Scholarly Comm. Infrastructure'
|
||||
WHEN (d.typology = 'pubsrepository::thematic') THEN 'Thematic Repository'
|
||||
WHEN (d.typology = 'websource') THEN 'Web Source'
|
||||
WHEN (d.typology = 'entityregistry::projects') THEN 'Funder database'
|
||||
WHEN (d.typology = 'entityregistry::repositories') THEN 'Registry of repositories'
|
||||
WHEN (d.typology = 'softwarerepository') THEN 'Software Repository'
|
||||
WHEN (d.typology = 'aggregator::softwarerepository') THEN 'Software Repository Aggregator'
|
||||
WHEN (d.typology = 'orprepository') THEN 'Repository'
|
||||
ELSE 'Other'
|
||||
END || '@@@dnet:datasource_typologies@@@dnet:datasource_typologies' AS datasourcetype,
|
||||
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
|
||||
CONCAT(d.issn, '@@@', d.eissn, '@@@', d.lissn) AS journal
|
||||
|
||||
FROM dsm_datasources d
|
||||
|
||||
LEFT OUTER JOIN dsm_datasources dc on (d.collectedfrom = dc.id)
|
||||
LEFT OUTER JOIN dsm_api a ON (d.id = a.datasource)
|
||||
LEFT OUTER JOIN dsm_datasourcepids di ON (d.id = di.datasource)
|
||||
|
||||
GROUP BY
|
||||
d.id,
|
||||
d.officialname,
|
||||
d.englishname,
|
||||
d.websiteurl,
|
||||
d.logourl,
|
||||
d.contactemail,
|
||||
d.namespaceprefix,
|
||||
d.description,
|
||||
d.latitude,
|
||||
d.longitude,
|
||||
d.dateofcollection,
|
||||
d.dateofvalidation,
|
||||
d.releasestartdate,
|
||||
d.releaseenddate,
|
||||
d.missionstatementurl,
|
||||
d.dataprovider,
|
||||
d.serviceprovider,
|
||||
d.databaseaccesstype,
|
||||
d.datauploadtype,
|
||||
d.databaseaccessrestriction,
|
||||
d.datauploadrestriction,
|
||||
d.versioning,
|
||||
d.citationguidelineurl,
|
||||
d.qualitymanagementkind,
|
||||
d.pidsystems,
|
||||
d.certificates,
|
||||
dc.id,
|
||||
dc.officialname,
|
||||
d.issn,
|
||||
d.eissn,
|
||||
d.lissn
|
|
@ -0,0 +1,36 @@
|
|||
SELECT
|
||||
o.id AS organizationid,
|
||||
o.legalshortname AS legalshortname,
|
||||
o.legalname AS legalname,
|
||||
o.websiteurl AS websiteurl,
|
||||
o.logourl AS logourl,
|
||||
o.ec_legalbody AS eclegalbody,
|
||||
o.ec_legalperson AS eclegalperson,
|
||||
o.ec_nonprofit AS ecnonprofit,
|
||||
o.ec_researchorganization AS ecresearchorganization,
|
||||
o.ec_highereducation AS echighereducation,
|
||||
o.ec_internationalorganizationeurinterests AS ecinternationalorganizationeurinterests,
|
||||
o.ec_internationalorganization AS ecinternationalorganization,
|
||||
o.ec_enterprise AS ecenterprise,
|
||||
o.ec_smevalidated AS ecsmevalidated,
|
||||
o.ec_nutscode AS ecnutscode,
|
||||
o.dateofcollection AS dateofcollection,
|
||||
o.lastupdate AS dateoftransformation,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
o.trust AS trust,
|
||||
'' AS inferenceprovenance,
|
||||
d.id AS collectedfromid,
|
||||
d.officialname AS collectedfromname,
|
||||
|
||||
o.country || '@@@dnet:countries' AS country,
|
||||
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
|
||||
|
||||
ARRAY[]::text[] AS pid
|
||||
FROM dsm_organizations o
|
||||
LEFT OUTER JOIN dsm_datasources d ON (d.id = o.collectedfrom)
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
SELECT
|
||||
o.id AS organizationid,
|
||||
coalesce((array_agg(a.acronym))[1], o.name) AS legalshortname,
|
||||
o.name AS legalname,
|
||||
array_agg(DISTINCT n.name) AS "alternativeNames",
|
||||
(array_agg(u.url))[1] AS websiteurl,
|
||||
o.modification_date AS dateoftransformation,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.95 AS trust,
|
||||
'' AS inferenceprovenance,
|
||||
'openaire____::openorgs' AS collectedfromid,
|
||||
'OpenOrgs Database' AS collectedfromname,
|
||||
o.country || '@@@dnet:countries' AS country,
|
||||
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
|
||||
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
|
||||
FROM organizations o
|
||||
LEFT OUTER JOIN acronyms a ON (a.id = o.id)
|
||||
LEFT OUTER JOIN urls u ON (u.id = o.id)
|
||||
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
|
||||
LEFT OUTER JOIN other_names n ON (n.id = o.id)
|
||||
GROUP BY
|
||||
o.id,
|
||||
o.name,
|
||||
o.modification_date,
|
||||
o.country
|
||||
|
||||
UNION ALL
|
||||
|
||||
SELECT
|
||||
'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS organizationid,
|
||||
n.name AS legalshortname,
|
||||
n.name AS legalname,
|
||||
ARRAY[]::text[] AS "alternativeNames",
|
||||
(array_agg(u.url))[1] AS websiteurl,
|
||||
o.modification_date AS dateoftransformation,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
0.88 AS trust,
|
||||
'' AS inferenceprovenance,
|
||||
'openaire____::openorgs' AS collectedfromid,
|
||||
'OpenOrgs Database' AS collectedfromname,
|
||||
o.country || '@@@dnet:countries' AS country,
|
||||
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction,
|
||||
array_agg(DISTINCT i.otherid || '###' || i.type || '@@@dnet:pid_types') AS pid
|
||||
FROM other_names n
|
||||
LEFT OUTER JOIN organizations o ON (n.id = o.id)
|
||||
LEFT OUTER JOIN urls u ON (u.id = o.id)
|
||||
LEFT OUTER JOIN other_ids i ON (i.id = o.id)
|
||||
GROUP BY
|
||||
o.id, o.modification_date, o.country, n.name
|
||||
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
SELECT
|
||||
po.project AS project,
|
||||
po.resporganization AS resporganization,
|
||||
po.participantnumber AS participantnumber,
|
||||
po.contribution AS contribution,
|
||||
NULL AS startdate,
|
||||
NULL AS enddate,
|
||||
false AS inferred,
|
||||
false AS deletedbyinference,
|
||||
po.trust AS trust,
|
||||
NULL AS inferenceprovenance,
|
||||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
po.semanticclass || '@@@' || po.semanticclass || '@@@dnet:project_organization_relations@@@dnet:project_organization_relations' AS semantics,
|
||||
'sysimport:crosswalk:entityregistry@@@sysimport:crosswalk:entityregistry@@@dnet:provenance_actions@@@dnet:provenance_actions' AS provenanceaction
|
||||
|
||||
FROM project_organization po
|
||||
LEFT OUTER JOIN projects p ON (p.id = po.project)
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
|
|
@ -0,0 +1,90 @@
|
|||
SELECT
|
||||
p.id AS projectid,
|
||||
p.code AS code,
|
||||
p.websiteurl AS websiteurl,
|
||||
p.acronym AS acronym,
|
||||
p.title AS title,
|
||||
p.startdate AS startdate,
|
||||
p.enddate AS enddate,
|
||||
p.call_identifier AS callidentifier,
|
||||
p.keywords AS keywords,
|
||||
p.duration AS duration,
|
||||
p.ec_sc39 AS ecsc39,
|
||||
p.oa_mandate_for_publications AS oamandatepublications,
|
||||
p.ec_article29_3 AS ecarticle29_3,
|
||||
p.dateofcollection AS dateofcollection,
|
||||
p.lastupdate AS dateoftransformation,
|
||||
p.inferred AS inferred,
|
||||
p.deletedbyinference AS deletedbyinference,
|
||||
p.trust AS trust,
|
||||
p.inferenceprovenance AS inferenceprovenance,
|
||||
p.optional1 AS optional1,
|
||||
p.optional2 AS optional2,
|
||||
p.jsonextrainfo AS jsonextrainfo,
|
||||
p.contactfullname AS contactfullname,
|
||||
p.contactfax AS contactfax,
|
||||
p.contactphone AS contactphone,
|
||||
p.contactemail AS contactemail,
|
||||
p.summary AS summary,
|
||||
p.currency AS currency,
|
||||
p.totalcost AS totalcost,
|
||||
p.fundedamount AS fundedamount,
|
||||
dc.id AS collectedfromid,
|
||||
dc.officialname AS collectedfromname,
|
||||
ctc.code || '@@@' || ctc.name || '@@@' || cts.code || '@@@' || cts.name AS contracttype,
|
||||
pac.code || '@@@' || pac.name || '@@@' || pas.code || '@@@' || pas.name AS provenanceaction,
|
||||
array_agg(DISTINCT i.pid || '###' || i.issuertype) AS pid,
|
||||
array_agg(DISTINCT s.name || '###' || sc.code || '@@@' || sc.name || '@@@' || ss.code || '@@@' || ss.name) AS subjects,
|
||||
array_agg(DISTINCT fp.path) AS fundingtree
|
||||
FROM projects p
|
||||
LEFT OUTER JOIN class pac ON (pac.code = p.provenanceactionclass)
|
||||
LEFT OUTER JOIN scheme pas ON (pas.code = p.provenanceactionscheme)
|
||||
|
||||
LEFT OUTER JOIN projectpids pp ON (pp.project = p.id)
|
||||
LEFT OUTER JOIN dsm_identities i ON (i.pid = pp.pid)
|
||||
|
||||
LEFT OUTER JOIN dsm_datasources dc ON (dc.id = p.collectedfrom)
|
||||
|
||||
LEFT OUTER JOIN project_fundingpath pf ON (pf.project = p.id)
|
||||
LEFT OUTER JOIN fundingpaths fp ON (fp.id = pf.funding)
|
||||
|
||||
LEFT OUTER JOIN project_subject ps ON (ps.project = p.id)
|
||||
LEFT OUTER JOIN subjects s ON (s.id = ps.subject)
|
||||
|
||||
LEFT OUTER JOIN class sc ON (sc.code = s.semanticclass)
|
||||
LEFT OUTER JOIN scheme ss ON (ss.code = s.semanticscheme)
|
||||
|
||||
LEFT OUTER JOIN class ctc ON (ctc.code = p.contracttypeclass)
|
||||
LEFT OUTER JOIN scheme cts ON (cts.code = p.contracttypescheme)
|
||||
|
||||
GROUP BY
|
||||
p.id,
|
||||
p.code,
|
||||
p.websiteurl,
|
||||
p.acronym,
|
||||
p.title,
|
||||
p.startdate,
|
||||
p.enddate,
|
||||
p.call_identifier,
|
||||
p.keywords,
|
||||
p.duration,
|
||||
p.ec_sc39,
|
||||
p.oa_mandate_for_publications,
|
||||
p.ec_article29_3,
|
||||
p.dateofcollection,
|
||||
p.inferred,
|
||||
p.deletedbyinference,
|
||||
p.trust,
|
||||
p.inferenceprovenance,
|
||||
p.contactfullname,
|
||||
p.contactfax,
|
||||
p.contactphone,
|
||||
p.contactemail,
|
||||
p.summary,
|
||||
p.currency,
|
||||
p.totalcost,
|
||||
p.fundedamount,
|
||||
dc.id,
|
||||
dc.officialname,
|
||||
pac.code, pac.name, pas.code, pas.name,
|
||||
ctc.code, ctc.name, cts.code, cts.name;
|
|
@ -0,0 +1,17 @@
|
|||
SELECT local_id AS id1, oa_original_id AS id2 FROM openaire_simrels WHERE reltype = 'is_similar'
|
||||
|
||||
UNION ALL
|
||||
|
||||
SELECT
|
||||
o.id AS id1,
|
||||
'openorgsmesh'||substring(o.id, 13)||'-'||md5(a.acronym) AS id2
|
||||
FROM acronyms a
|
||||
LEFT OUTER JOIN organizations o ON (a.id = o.id)
|
||||
|
||||
UNION ALL
|
||||
|
||||
SELECT
|
||||
o.id AS id1,
|
||||
'openorgsmesh'||substring(o.id, 13)||'-'||md5(n.name) AS id2
|
||||
FROM other_names n
|
||||
LEFT OUTER JOIN organizations o ON (n.id = o.id)
|
|
@ -0,0 +1,9 @@
|
|||
# Set root logger level to INFO and its only appender to A1.
|
||||
log4j.rootLogger=INFO, A1
|
||||
|
||||
# A1 is set to be a ConsoleAppender.
|
||||
log4j.appender.A1=org.apache.log4j.ConsoleAppender
|
||||
|
||||
# A1 uses PatternLayout.
|
||||
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
|
|
@ -1,15 +1,45 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>dhp-dedup</artifactId>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>4.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-test-compile</id>
|
||||
<phase>process-test-resources</phase>
|
||||
<goals>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -1,10 +1,18 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class GraphMappingUtils {
|
||||
|
||||
public final static Map<String, Class> types = Maps.newHashMap();
|
||||
|
|
|
@ -4,12 +4,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkGraphImporterJob {
|
||||
|
||||
|
@ -33,13 +31,8 @@ public class SparkGraphImporterJob {
|
|||
|
||||
// Read the input file and convert it into RDD of serializable object
|
||||
GraphMappingUtils.types.forEach((name, clazz) -> {
|
||||
final JavaRDD<Tuple2<String, String>> inputRDD = sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class)
|
||||
.map(item -> new Tuple2<>(item._1.toString(), item._2.toString()));
|
||||
|
||||
spark.createDataset(inputRDD
|
||||
.filter(s -> s._1().equals(clazz.getName()))
|
||||
.map(Tuple2::_2)
|
||||
.map(s -> new ObjectMapper().readValue(s, clazz))
|
||||
spark.createDataset(sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class)
|
||||
.map(s -> new ObjectMapper().readValue(s._2().toString(), clazz))
|
||||
.rdd(), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
sparkDriverMemory=8G
|
||||
sparkExecutorMemory=8G
|
||||
#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
|
||||
isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
|
||||
sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
|
||||
outputPath=/tmp/openaire_provision
|
||||
format=TMF
|
||||
batchSize=2000
|
||||
sparkExecutorCoresForIndexing=64
|
||||
reuseRecords=true
|
|
@ -0,0 +1,92 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>dhp-graph-provision</artifactId>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>jaxen</groupId>
|
||||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.mycila.xmltool</groupId>
|
||||
<artifactId>xmltool</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.antlr</groupId>
|
||||
<artifactId>stringtemplate</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.solr</groupId>
|
||||
<artifactId>solr-solrj</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.lucidworks.spark</groupId>
|
||||
<artifactId>spark-solr</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpmime</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.noggit</groupId>
|
||||
<artifactId>noggit</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.zookeeper</groupId>
|
||||
<artifactId>zookeeper</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.cxf</groupId>
|
||||
<artifactId>cxf-rt-transports-http</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>cnr-rmi-api</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
|
@ -0,0 +1,257 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.jayway.jsonpath.DocumentContext;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import eu.dnetlib.dhp.graph.model.*;
|
||||
import eu.dnetlib.dhp.graph.utils.ContextMapper;
|
||||
import eu.dnetlib.dhp.graph.utils.GraphMappingUtils;
|
||||
import eu.dnetlib.dhp.graph.utils.XmlRecordFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity;
|
||||
|
||||
/**
|
||||
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
|
||||
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization,
|
||||
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
|
||||
*
|
||||
* The operation is implemented creating the union between the entity types (E), joined by the relationships (R), and again
|
||||
* by E, finally grouped by E.id;
|
||||
*
|
||||
* Different manipulations of the E and R sets are introduced to reduce the complexity of the operation
|
||||
* 1) treat the object payload as string, extracting only the necessary information beforehand using json path,
|
||||
* it seems that deserializing it with jackson's object mapper has higher memory footprint.
|
||||
*
|
||||
* 2) only consider rels that are not virtually deleted ($.dataInfo.deletedbyinference == false)
|
||||
* 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S
|
||||
* and E_target = T. Objects in T are heavily pruned by all the unnecessary information
|
||||
*
|
||||
* 4) perform the join as (((T join R) union S) groupby S.id) yield S -> [ <T, R> ]
|
||||
*/
|
||||
public class GraphJoiner implements Serializable {
|
||||
|
||||
public static final int MAX_RELS = 100;
|
||||
|
||||
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
|
||||
|
||||
private SparkSession spark;
|
||||
|
||||
private ContextMapper contextMapper;
|
||||
|
||||
private String inputPath;
|
||||
|
||||
private String outPath;
|
||||
|
||||
public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String inputPath, String outPath) {
|
||||
this.spark = spark;
|
||||
this.contextMapper = contextMapper;
|
||||
this.inputPath = inputPath;
|
||||
this.outPath = outPath;
|
||||
}
|
||||
|
||||
public GraphJoiner adjacencyLists() {
|
||||
final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext());
|
||||
|
||||
// read each entity
|
||||
JavaPairRDD<String, TypedRow> datasource = readPathEntity(sc, getInputPath(), "datasource");
|
||||
JavaPairRDD<String, TypedRow> organization = readPathEntity(sc, getInputPath(), "organization");
|
||||
JavaPairRDD<String, TypedRow> project = readPathEntity(sc, getInputPath(), "project");
|
||||
JavaPairRDD<String, TypedRow> dataset = readPathEntity(sc, getInputPath(), "dataset");
|
||||
JavaPairRDD<String, TypedRow> otherresearchproduct = readPathEntity(sc, getInputPath(), "otherresearchproduct");
|
||||
JavaPairRDD<String, TypedRow> software = readPathEntity(sc, getInputPath(), "software");
|
||||
JavaPairRDD<String, TypedRow> publication = readPathEntity(sc, getInputPath(), "publication");
|
||||
|
||||
// create the union between all the entities
|
||||
final String entitiesPath = getOutPath() + "/entities";
|
||||
datasource
|
||||
.union(organization)
|
||||
.union(project)
|
||||
.union(dataset)
|
||||
.union(otherresearchproduct)
|
||||
.union(software)
|
||||
.union(publication)
|
||||
.map(e -> new EntityRelEntity().setSource(e._2()))
|
||||
.map(GraphMappingUtils::serialize)
|
||||
.saveAsTextFile(entitiesPath, GzipCodec.class);
|
||||
|
||||
JavaPairRDD<String, EntityRelEntity> entities = sc.textFile(entitiesPath)
|
||||
.map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class))
|
||||
.mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t));
|
||||
|
||||
// reads the relationships
|
||||
final JavaPairRDD<String, EntityRelEntity> relation = readPathRelation(sc, getInputPath())
|
||||
.filter(r -> !r.getDeleted()) //only consider those that are not virtually deleted
|
||||
.map(p -> new EntityRelEntity().setRelation(p))
|
||||
.mapToPair(p -> new Tuple2<>(p.getRelation().getSourceId(), p))
|
||||
.groupByKey()
|
||||
.map(p -> Iterables.limit(p._2(), MAX_RELS))
|
||||
.flatMap(p -> p.iterator())
|
||||
.mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p));
|
||||
|
||||
//final String bySource = getOutPath() + "/1_join_by_target";
|
||||
JavaPairRDD<String, EntityRelEntity> bySource = relation
|
||||
.join(entities
|
||||
.filter(e -> !e._2().getSource().getDeleted())
|
||||
.mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2()))))
|
||||
.map(s -> new EntityRelEntity()
|
||||
.setRelation(s._2()._1().getRelation())
|
||||
.setTarget(s._2()._2().getSource()))
|
||||
.mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t));
|
||||
|
||||
final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, false, schemaLocation, new HashSet<>());
|
||||
entities
|
||||
.union(bySource)
|
||||
.groupByKey() // by source id
|
||||
.map(l -> toJoinedEntity(l))
|
||||
.mapToPair(je -> new Tuple2<>(
|
||||
new Text(je.getEntity().getId()),
|
||||
new Text(recordFactory.build(je))))
|
||||
.saveAsHadoopFile(getOutPath() + "/xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public GraphJoiner asXML() {
|
||||
final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext());
|
||||
final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, true, "", new HashSet<>());
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
final String joinedEntitiesPath = getOutPath() + "/1_joined_entities";
|
||||
sc.textFile(joinedEntitiesPath)
|
||||
.map(s -> mapper.readValue(s, JoinedEntity.class))
|
||||
.mapToPair(je -> new Tuple2<>(new Text(je.getEntity().getId()), new Text(recordFactory.build(je))))
|
||||
.saveAsHadoopFile(getOutPath() + "/2_xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public SparkSession getSpark() {
|
||||
return spark;
|
||||
}
|
||||
|
||||
public String getInputPath() {
|
||||
return inputPath;
|
||||
}
|
||||
|
||||
public String getOutPath() {
|
||||
return outPath;
|
||||
}
|
||||
|
||||
// HELPERS
|
||||
|
||||
private OafEntity parseOaf(final String json, final String type) {
|
||||
final ObjectMapper o = new ObjectMapper();
|
||||
try {
|
||||
switch (GraphMappingUtils.EntityType.valueOf(type)) {
|
||||
case publication:
|
||||
return o.readValue(json, Publication.class);
|
||||
case dataset:
|
||||
return o.readValue(json, Dataset.class);
|
||||
case otherresearchproduct:
|
||||
return o.readValue(json, OtherResearchProduct.class);
|
||||
case software:
|
||||
return o.readValue(json, Software.class);
|
||||
case datasource:
|
||||
return o.readValue(json, Datasource.class);
|
||||
case organization:
|
||||
return o.readValue(json, Organization.class);
|
||||
case project:
|
||||
return o.readValue(json, Project.class);
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid type: " + type);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private JoinedEntity toJoinedEntity(Tuple2<String, Iterable<EntityRelEntity>> p) {
|
||||
final ObjectMapper o = new ObjectMapper();
|
||||
final JoinedEntity j = new JoinedEntity();
|
||||
final Links links2 = new Links();
|
||||
for(EntityRelEntity rel : p._2()) {
|
||||
if (rel.hasMainEntity() & j.getEntity() == null) {
|
||||
j.setType(rel.getSource().getType());
|
||||
j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType()));
|
||||
}
|
||||
if (rel.hasRelatedEntity()) {
|
||||
try {
|
||||
links2.add(
|
||||
new eu.dnetlib.dhp.graph.model.Tuple2()
|
||||
.setRelation(o.readValue(rel.getRelation().getOaf(), Relation.class))
|
||||
.setRelatedEntity(o.readValue(rel.getTarget().getOaf(), RelatedEntity.class)));
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
j.setLinks(links2);
|
||||
if (j.getEntity() == null) {
|
||||
throw new IllegalStateException("missing main entity on '" + p._1() + "'");
|
||||
}
|
||||
return j;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a set of eu.dnetlib.dhp.schema.oaf.OafEntity objects from a sequence file <className, entity json serialization>,
|
||||
* extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow
|
||||
* @param sc
|
||||
* @param inputPath
|
||||
* @param type
|
||||
* @return the JavaPairRDD<String, TypedRow> indexed by entity identifier
|
||||
*/
|
||||
private JavaPairRDD<String, TypedRow> readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) {
|
||||
return sc.sequenceFile(inputPath + "/" + type, Text.class, Text.class)
|
||||
.mapToPair((PairFunction<Tuple2<Text, Text>, String, TypedRow>) item -> {
|
||||
final String s = item._2().toString();
|
||||
final DocumentContext json = JsonPath.parse(s);
|
||||
final String id = json.read("$.id");
|
||||
return new Tuple2<>(id, new TypedRow()
|
||||
.setSourceId(id)
|
||||
.setDeleted(json.read("$.dataInfo.deletedbyinference"))
|
||||
.setType(type)
|
||||
.setOaf(s));
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a set of eu.dnetlib.dhp.schema.oaf.Relation objects from a sequence file <className, relation json serialization>,
|
||||
* extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow
|
||||
* @param sc
|
||||
* @param inputPath
|
||||
* @return the JavaRDD<TypedRow> containing all the relationships
|
||||
*/
|
||||
private JavaRDD<TypedRow> readPathRelation(final JavaSparkContext sc, final String inputPath) {
|
||||
return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class)
|
||||
.map(item -> {
|
||||
final String s = item._2().toString();
|
||||
final DocumentContext json = JsonPath.parse(s);
|
||||
return new TypedRow()
|
||||
.setSourceId(json.read("$.source"))
|
||||
.setTargetId(json.read("$.target"))
|
||||
.setDeleted(json.read("$.dataInfo.deletedbyinference"))
|
||||
.setType("relation")
|
||||
.setOaf(s);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,188 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import com.lucidworks.spark.util.SolrSupport;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.graph.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.dhp.graph.utils.StreamingInputDocumentFactory;
|
||||
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.rdd.RDD;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerException;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import javax.xml.transform.stream.StreamSource;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.io.StringWriter;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
|
||||
public class SparkXmlIndexingJob {
|
||||
|
||||
private static final Log log = LogFactory.getLog(SparkXmlIndexingJob.class);
|
||||
|
||||
private static final Integer DEFAULT_BATCH_SIZE = 1000;
|
||||
|
||||
private static final String LAYOUT = "index";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlIndexingJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_params_update_index.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
final String format = parser.get("format");
|
||||
final Integer batchSize = parser.getObjectMap().containsKey("batchSize") ? Integer.valueOf(parser.get("batchSize")) : DEFAULT_BATCH_SIZE;
|
||||
|
||||
final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
final String fields = getLayoutSource(isLookup, format);
|
||||
final String xslt = getLayoutTransformer(isLookup);
|
||||
|
||||
final String dsId = getDsId(format, isLookup);
|
||||
final String zkHost = getZkHost(isLookup);
|
||||
final String version = getRecordDatestamp();
|
||||
|
||||
final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
|
||||
|
||||
log.info("indexRecordTransformer: " + indexRecordXslt);
|
||||
|
||||
final String master = parser.get("master");
|
||||
final SparkConf conf = new SparkConf()
|
||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
|
||||
try(SparkSession spark = getSession(conf, master)) {
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
RDD<SolrInputDocument> docs = sc.sequenceFile(inputPath, Text.class, Text.class)
|
||||
.map(t -> t._2().toString())
|
||||
.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
|
||||
.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s))
|
||||
.rdd();
|
||||
|
||||
SolrSupport.indexDocs(zkHost, format + "-" + LAYOUT + "-openaire", batchSize, docs);
|
||||
}
|
||||
}
|
||||
|
||||
private static SparkSession getSession(SparkConf conf, String master) {
|
||||
return SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkXmlRecordBuilderJob.class.getSimpleName())
|
||||
.master(master)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
private static String toIndexRecord(Transformer tr, final String record) {
|
||||
final StreamResult res = new StreamResult(new StringWriter());
|
||||
try {
|
||||
tr.transform(new StreamSource(new StringReader(record)), res);
|
||||
return res.getWriter().toString();
|
||||
} catch (Throwable e) {
|
||||
System.out.println("XPathException on record:\n" + record);
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates the XSLT responsible for building the index xml records.
|
||||
*
|
||||
* @param format Metadata format name (DMF|TMF)
|
||||
* @param xslt xslt for building the index record transformer
|
||||
* @param fields the list of fields
|
||||
* @return the javax.xml.transform.Transformer
|
||||
* @throws ISLookUpException could happen
|
||||
* @throws IOException could happen
|
||||
* @throws TransformerException could happen
|
||||
*/
|
||||
private static String getLayoutTransformer(String format, String fields, String xslt) throws TransformerException {
|
||||
|
||||
final Transformer layoutTransformer = SaxonTransformerFactory.newInstance(xslt);
|
||||
final StreamResult layoutToXsltXslt = new StreamResult(new StringWriter());
|
||||
|
||||
layoutTransformer.setParameter("format", format);
|
||||
layoutTransformer.transform(new StreamSource(new StringReader(fields)), layoutToXsltXslt);
|
||||
|
||||
return layoutToXsltXslt.getWriter().toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* method return a solr-compatible string representation of a date, used to mark all records as indexed today
|
||||
* @return the parsed date
|
||||
*/
|
||||
public static String getRecordDatestamp() {
|
||||
return new SimpleDateFormat("yyyy-MM-dd'T'hh:mm:ss'Z'").format(new Date());
|
||||
}
|
||||
|
||||
/**
|
||||
* Method retrieves from the information system the list of fields associated to the given MDFormat name
|
||||
*
|
||||
* @param isLookup the ISLookup service stub
|
||||
* @param format the Metadata format name
|
||||
* @return the string representation of the list of fields to be indexed
|
||||
*
|
||||
* @throws ISLookUpDocumentNotFoundException
|
||||
* @throws ISLookUpException
|
||||
*/
|
||||
private static String getLayoutSource(final ISLookUpService isLookup, final String format) throws ISLookUpDocumentNotFoundException, ISLookUpException {
|
||||
return doLookup(isLookup, String.format(
|
||||
"collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']", format, LAYOUT));
|
||||
}
|
||||
|
||||
/**
|
||||
* Method retrieves from the information system the openaireLayoutToRecordStylesheet
|
||||
*
|
||||
* @param isLookup the ISLookup service stub
|
||||
* @return the string representation of the XSLT contained in the transformation rule profile
|
||||
*
|
||||
* @throws ISLookUpDocumentNotFoundException
|
||||
* @throws ISLookUpException
|
||||
*/
|
||||
private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException {
|
||||
return doLookup(isLookup, "collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')" +
|
||||
"//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()");
|
||||
}
|
||||
|
||||
/**
|
||||
* Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name
|
||||
* @param format
|
||||
* @param isLookup
|
||||
* @return the IndexDS identifier
|
||||
* @throws ISLookUpException
|
||||
*/
|
||||
private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException {
|
||||
return doLookup(isLookup, String.format("collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')" +
|
||||
"//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()", format));
|
||||
}
|
||||
|
||||
/**
|
||||
* Method retrieves from the information system the zookeeper quorum of the Solr server
|
||||
* @param isLookup
|
||||
* @return the zookeeper quorum of the Solr server
|
||||
* @throws ISLookUpException
|
||||
*/
|
||||
private static String getZkHost(ISLookUpService isLookup) throws ISLookUpException {
|
||||
return doLookup(isLookup, "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()");
|
||||
}
|
||||
|
||||
private static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException {
|
||||
log.info(String.format("running xquery: %s", xquery));
|
||||
final String res = isLookup.getResourceProfileByQuery(xquery);
|
||||
log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ..."));
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.graph.utils.ContextMapper;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
public class SparkXmlRecordBuilderJob {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlRecordBuilderJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String master = parser.get("master");
|
||||
final SparkConf conf = new SparkConf()
|
||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
|
||||
try(SparkSession spark = getSession(conf, master)) {
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String outputPath = parser.get("outputPath");
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
|
||||
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
|
||||
if (fs.exists(new Path(outputPath))) {
|
||||
fs.delete(new Path(outputPath), true);
|
||||
fs.mkdirs(new Path(outputPath));
|
||||
}
|
||||
|
||||
new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), inputPath, outputPath)
|
||||
.adjacencyLists();
|
||||
}
|
||||
}
|
||||
|
||||
private static SparkSession getSession(SparkConf conf, String master) {
|
||||
return SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkXmlRecordBuilderJob.class.getSimpleName())
|
||||
.master(master)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class EntityRelEntity implements Serializable {
|
||||
|
||||
private TypedRow source;
|
||||
private TypedRow relation;
|
||||
private TypedRow target;
|
||||
|
||||
public EntityRelEntity() {
|
||||
}
|
||||
|
||||
public EntityRelEntity(TypedRow source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
//helpers
|
||||
public Boolean hasMainEntity() {
|
||||
return getSource() != null & getRelation() == null & getTarget() == null;
|
||||
}
|
||||
|
||||
public Boolean hasRelatedEntity() {
|
||||
return getSource() == null & getRelation() != null & getTarget() != null;
|
||||
}
|
||||
|
||||
|
||||
public TypedRow getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public EntityRelEntity setSource(TypedRow source) {
|
||||
this.source = source;
|
||||
return this;
|
||||
}
|
||||
|
||||
public TypedRow getRelation() {
|
||||
return relation;
|
||||
}
|
||||
|
||||
public EntityRelEntity setRelation(TypedRow relation) {
|
||||
this.relation = relation;
|
||||
return this;
|
||||
}
|
||||
|
||||
public TypedRow getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
public EntityRelEntity setTarget(TypedRow target) {
|
||||
this.target = target;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class JoinedEntity implements Serializable {
|
||||
|
||||
private String type;
|
||||
|
||||
private OafEntity entity;
|
||||
|
||||
private Links links;
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public JoinedEntity setType(String type) {
|
||||
this.type = type;
|
||||
return this;
|
||||
}
|
||||
|
||||
public OafEntity getEntity() {
|
||||
return entity;
|
||||
}
|
||||
|
||||
public JoinedEntity setEntity(OafEntity entity) {
|
||||
this.entity = entity;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Links getLinks() {
|
||||
return links;
|
||||
}
|
||||
|
||||
public JoinedEntity setLinks(Links links) {
|
||||
this.links = links;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class Links extends ArrayList<Tuple2> {
|
||||
}
|
|
@ -0,0 +1,257 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class RelatedEntity implements Serializable {
|
||||
|
||||
private String id;
|
||||
private String type;
|
||||
|
||||
// common fields
|
||||
private StructuredProperty title;
|
||||
private String websiteurl; // datasource, organizations, projects
|
||||
|
||||
// results
|
||||
private String dateofacceptance;
|
||||
private String publisher;
|
||||
private List<StructuredProperty> pid;
|
||||
private String codeRepositoryUrl;
|
||||
private Qualifier resulttype;
|
||||
private List<KeyValue> collectedfrom;
|
||||
private List<Instance> instances;
|
||||
|
||||
// datasource
|
||||
private String officialname;
|
||||
private Qualifier datasourcetype;
|
||||
private Qualifier datasourcetypeui;
|
||||
private Qualifier openairecompatibility;
|
||||
//private String aggregatortype;
|
||||
|
||||
// organization
|
||||
private String legalname;
|
||||
private String legalshortname;
|
||||
private Qualifier country;
|
||||
|
||||
// project
|
||||
private String projectTitle;
|
||||
private String code;
|
||||
private String acronym;
|
||||
private Qualifier contracttype;
|
||||
private List<String> fundingtree;
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public RelatedEntity setId(String id) {
|
||||
this.id = id;
|
||||
return this;
|
||||
}
|
||||
|
||||
public StructuredProperty getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public RelatedEntity setTitle(StructuredProperty title) {
|
||||
this.title = title;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getDateofacceptance() {
|
||||
return dateofacceptance;
|
||||
}
|
||||
|
||||
public RelatedEntity setDateofacceptance(String dateofacceptance) {
|
||||
this.dateofacceptance = dateofacceptance;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public RelatedEntity setPublisher(String publisher) {
|
||||
this.publisher = publisher;
|
||||
return this;
|
||||
}
|
||||
|
||||
public List<StructuredProperty> getPid() {
|
||||
return pid;
|
||||
}
|
||||
|
||||
public RelatedEntity setPid(List<StructuredProperty> pid) {
|
||||
this.pid = pid;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getCodeRepositoryUrl() {
|
||||
return codeRepositoryUrl;
|
||||
}
|
||||
|
||||
public RelatedEntity setCodeRepositoryUrl(String codeRepositoryUrl) {
|
||||
this.codeRepositoryUrl = codeRepositoryUrl;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getResulttype() {
|
||||
return resulttype;
|
||||
}
|
||||
|
||||
public RelatedEntity setResulttype(Qualifier resulttype) {
|
||||
this.resulttype = resulttype;
|
||||
return this;
|
||||
}
|
||||
|
||||
public List<KeyValue> getCollectedfrom() {
|
||||
return collectedfrom;
|
||||
}
|
||||
|
||||
public RelatedEntity setCollectedfrom(List<KeyValue> collectedfrom) {
|
||||
this.collectedfrom = collectedfrom;
|
||||
return this;
|
||||
}
|
||||
|
||||
public List<Instance> getInstances() {
|
||||
return instances;
|
||||
}
|
||||
|
||||
public RelatedEntity setInstances(List<Instance> instances) {
|
||||
this.instances = instances;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getOfficialname() {
|
||||
return officialname;
|
||||
}
|
||||
|
||||
public RelatedEntity setOfficialname(String officialname) {
|
||||
this.officialname = officialname;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getWebsiteurl() {
|
||||
return websiteurl;
|
||||
}
|
||||
|
||||
public RelatedEntity setWebsiteurl(String websiteurl) {
|
||||
this.websiteurl = websiteurl;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getDatasourcetype() {
|
||||
return datasourcetype;
|
||||
}
|
||||
|
||||
public RelatedEntity setDatasourcetype(Qualifier datasourcetype) {
|
||||
this.datasourcetype = datasourcetype;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getDatasourcetypeui() {
|
||||
return datasourcetypeui;
|
||||
}
|
||||
|
||||
public RelatedEntity setDatasourcetypeui(Qualifier datasourcetypeui) {
|
||||
this.datasourcetypeui = datasourcetypeui;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getOpenairecompatibility() {
|
||||
return openairecompatibility;
|
||||
}
|
||||
|
||||
public RelatedEntity setOpenairecompatibility(Qualifier openairecompatibility) {
|
||||
this.openairecompatibility = openairecompatibility;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getLegalname() {
|
||||
return legalname;
|
||||
}
|
||||
|
||||
public RelatedEntity setLegalname(String legalname) {
|
||||
this.legalname = legalname;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getLegalshortname() {
|
||||
return legalshortname;
|
||||
}
|
||||
|
||||
public RelatedEntity setLegalshortname(String legalshortname) {
|
||||
this.legalshortname = legalshortname;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getCountry() {
|
||||
return country;
|
||||
}
|
||||
|
||||
public RelatedEntity setCountry(Qualifier country) {
|
||||
this.country = country;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
public RelatedEntity setCode(String code) {
|
||||
this.code = code;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getAcronym() {
|
||||
return acronym;
|
||||
}
|
||||
|
||||
public RelatedEntity setAcronym(String acronym) {
|
||||
this.acronym = acronym;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getContracttype() {
|
||||
return contracttype;
|
||||
}
|
||||
|
||||
public RelatedEntity setContracttype(Qualifier contracttype) {
|
||||
this.contracttype = contracttype;
|
||||
return this;
|
||||
}
|
||||
|
||||
public List<String> getFundingtree() {
|
||||
return fundingtree;
|
||||
}
|
||||
|
||||
public RelatedEntity setFundingtree(List<String> fundingtree) {
|
||||
this.fundingtree = fundingtree;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getProjectTitle() {
|
||||
return projectTitle;
|
||||
}
|
||||
|
||||
public RelatedEntity setProjectTitle(String projectTitle) {
|
||||
this.projectTitle = projectTitle;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public RelatedEntity setType(String type) {
|
||||
this.type = type;
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class Tuple2 {
|
||||
|
||||
private Relation relation;
|
||||
|
||||
private RelatedEntity relatedEntity;
|
||||
|
||||
public Relation getRelation() {
|
||||
return relation;
|
||||
}
|
||||
|
||||
public Tuple2 setRelation(Relation relation) {
|
||||
this.relation = relation;
|
||||
return this;
|
||||
}
|
||||
|
||||
public RelatedEntity getRelatedEntity() {
|
||||
return relatedEntity;
|
||||
}
|
||||
|
||||
public Tuple2 setRelatedEntity(RelatedEntity relatedEntity) {
|
||||
this.relatedEntity = relatedEntity;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean carrying a source identifier, a target identifier, a deletion flag,
 * a type name and an {@code oaf} string payload.
 *
 * <p>Every setter returns {@code this}, so that instances can be populated with
 * chained calls.</p>
 */
public class TypedRow implements Serializable {

    private String sourceId;

    private String targetId;

    private Boolean deleted;

    private String type;

    private String oaf;

    /** @return the source identifier, or {@code null} if never set */
    public String getSourceId() {
        return this.sourceId;
    }

    /**
     * @param sourceId the source identifier to store
     * @return this instance
     */
    public TypedRow setSourceId(final String sourceId) {
        this.sourceId = sourceId;
        return this;
    }

    /** @return the target identifier, or {@code null} if never set */
    public String getTargetId() {
        return this.targetId;
    }

    /**
     * @param targetId the target identifier to store
     * @return this instance
     */
    public TypedRow setTargetId(final String targetId) {
        this.targetId = targetId;
        return this;
    }

    /** @return the deletion flag, or {@code null} if never set */
    public Boolean getDeleted() {
        return this.deleted;
    }

    /**
     * @param deleted the deletion flag to store
     * @return this instance
     */
    public TypedRow setDeleted(final Boolean deleted) {
        this.deleted = deleted;
        return this;
    }

    /** @return the type name, or {@code null} if never set */
    public String getType() {
        return this.type;
    }

    /**
     * @param type the type name to store
     * @return this instance
     */
    public TypedRow setType(final String type) {
        this.type = type;
        return this;
    }

    /** @return the oaf payload, or {@code null} if never set */
    public String getOaf() {
        return this.oaf;
    }

    /**
     * @param oaf the oaf payload to store
     * @return this instance
     */
    public TypedRow setOaf(final String oaf) {
        this.oaf = oaf;
        return this;
    }
}
|
|
@ -0,0 +1,51 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable description of a context entry: identifier, label, element name and type.
 */
public class ContextDef implements Serializable {

    private String id;
    private String label;
    private String name;
    private String type;

    /**
     * Creates a fully populated definition.
     *
     * <p>Fields are assigned directly rather than through the public setters: those
     * setters are overridable, and invoking an overridable method from a constructor
     * would run subclass code before the subclass state is initialised.</p>
     *
     * @param id    the entry identifier
     * @param label the entry label
     * @param name  the entry name
     * @param type  the entry type
     */
    public ContextDef(final String id, final String label, final String name, final String type) {
        super();
        this.id = id;
        this.label = label;
        this.name = name;
        this.type = type;
    }

    /** @return the label */
    public String getLabel() {
        return label;
    }

    /** @param label the label to store */
    public void setLabel(final String label) {
        this.label = label;
    }

    /** @return the identifier */
    public String getId() {
        return id;
    }

    /** @param id the identifier to store */
    public void setId(final String id) {
        this.id = id;
    }

    /** @return the name */
    public String getName() {
        return name;
    }

    /** @param name the name to store */
    public void setName(final String name) {
        this.name = name;
    }

    /** @return the type */
    public String getType() {
        return type;
    }

    /** @param type the type to store */
    public void setType(final String type) {
        this.type = type;
    }
}
|
|
@ -0,0 +1,45 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
|
||||
public class ContextMapper extends HashMap<String, ContextDef> implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 2159682308502487305L;
|
||||
|
||||
private final static String XQUERY = "for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']//*[name()='context' or name()='category' or name()='concept'] return <entry id=\"{$x/@id}\" label=\"{$x/@label|$x/@name}\" name=\"{$x/name()}\" type=\"{$x/@type}\"/>";
|
||||
|
||||
public static ContextMapper fromIS(final String isLookupUrl) throws DocumentException, ISLookUpException {
|
||||
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
StringBuilder sb = new StringBuilder("<ContextDSResources>");
|
||||
Joiner.on("").appendTo(sb, isLookUp.quickSearchProfile(XQUERY));
|
||||
sb.append("</ContextDSResources>");
|
||||
return fromXml(sb.toString());
|
||||
}
|
||||
|
||||
public static ContextMapper fromXml(final String xml) throws DocumentException {
|
||||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final Document doc = new SAXReader().read(new StringReader(xml));
|
||||
for (Object o : doc.selectNodes("//entry")) {
|
||||
Node node = (Node) o;
|
||||
String id = node.valueOf("./@id");
|
||||
String label = node.valueOf("./@label");
|
||||
String name = node.valueOf("./@name");
|
||||
String type = node.valueOf("./@type") + "";
|
||||
|
||||
contextMapper.put(id, new ContextDef(id, label, name, type));
|
||||
}
|
||||
return contextMapper;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,254 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.base.Predicate;
|
||||
import com.google.common.collect.BiMap;
|
||||
import com.google.common.collect.HashBiMap;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.jayway.jsonpath.DocumentContext;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import eu.dnetlib.dhp.graph.model.EntityRelEntity;
|
||||
import eu.dnetlib.dhp.graph.model.RelatedEntity;
|
||||
import eu.dnetlib.dhp.graph.model.TypedRow;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import net.minidev.json.JSONArray;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.commons.lang3.StringUtils.*;
|
||||
|
||||
public class GraphMappingUtils {
|
||||
|
||||
public enum EntityType {
|
||||
publication, dataset, otherresearchproduct, software, datasource, organization, project
|
||||
}
|
||||
|
||||
public enum MainEntityType {
|
||||
result, datasource, organization, project
|
||||
}
|
||||
|
||||
public static Set<String> authorPidTypes = Sets.newHashSet("orcid", "magidentifier");
|
||||
|
||||
public static Set<String> instanceFieldFilter = Sets.newHashSet("instancetype", "hostedby", "license", "accessright", "collectedfrom", "dateofacceptance", "distributionlocation");
|
||||
|
||||
private static BiMap<String, String> relClassMapping = HashBiMap.create();
|
||||
|
||||
static {
|
||||
relClassMapping.put("isAuthorInstitutionOf", "hasAuthorInstitution");
|
||||
relClassMapping.put("isMergedIn", "merges");
|
||||
relClassMapping.put("isProducedBy", "produces");
|
||||
relClassMapping.put("hasParticipant", "isParticipant");
|
||||
relClassMapping.put("isProvidedBy", "provides");
|
||||
relClassMapping.put("isRelatedTo", "isRelatedTo");
|
||||
relClassMapping.put("isAmongTopNSimilarDocuments", "hasAmongTopNSimilarDocuments");
|
||||
relClassMapping.put("isRelatedTo", "isRelatedTo");
|
||||
relClassMapping.put("isSupplementTo", "isSupplementedBy");
|
||||
}
|
||||
|
||||
public static String getInverseRelClass(final String relClass) {
|
||||
String res = relClassMapping.get(relClass);
|
||||
if (isNotBlank(res)) {
|
||||
return res;
|
||||
}
|
||||
res = relClassMapping.inverse().get(relClass);
|
||||
|
||||
if (isNotBlank(res)) {
|
||||
return res;
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("unable to find an inverse relationship class for term: " + relClass);
|
||||
}
|
||||
|
||||
private static final String schemeTemplate = "dnet:%s_%s_relations";
|
||||
|
||||
private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
entityMapping.put(EntityType.publication, MainEntityType.result);
|
||||
entityMapping.put(EntityType.dataset, MainEntityType.result);
|
||||
entityMapping.put(EntityType.otherresearchproduct, MainEntityType.result);
|
||||
entityMapping.put(EntityType.software, MainEntityType.result);
|
||||
entityMapping.put(EntityType.datasource, MainEntityType.datasource);
|
||||
entityMapping.put(EntityType.organization, MainEntityType.organization);
|
||||
entityMapping.put(EntityType.project, MainEntityType.project);
|
||||
}
|
||||
|
||||
public static String getScheme(final String sourceType, final String targetType) {
|
||||
return String.format(schemeTemplate,
|
||||
entityMapping.get(EntityType.valueOf(sourceType)).name(),
|
||||
entityMapping.get(EntityType.valueOf(targetType)).name());
|
||||
}
|
||||
|
||||
public static String getMainType(final String type) {
|
||||
return entityMapping.get(EntityType.valueOf(type)).name();
|
||||
}
|
||||
|
||||
public static boolean isResult(String type) {
|
||||
return MainEntityType.result.name().equals(getMainType(type));
|
||||
}
|
||||
|
||||
public static Predicate<String> instanceFilter = s -> instanceFieldFilter.contains(s);
|
||||
|
||||
public static EntityRelEntity asRelatedEntity(EntityRelEntity e) {
|
||||
|
||||
final DocumentContext j = JsonPath.parse(e.getSource().getOaf());
|
||||
final RelatedEntity re = new RelatedEntity().setId(j.read("$.id")).setType(e.getSource().getType());
|
||||
|
||||
switch (EntityType.valueOf(e.getSource().getType())) {
|
||||
case publication:
|
||||
case dataset:
|
||||
case otherresearchproduct:
|
||||
case software:
|
||||
mapTitle(j, re);
|
||||
re.setDateofacceptance(j.read("$.dateofacceptance.value"));
|
||||
re.setPublisher(j.read("$.publisher.value"));
|
||||
|
||||
JSONArray pids = j.read("$.pid");
|
||||
re.setPid(pids.stream()
|
||||
.map(p -> asStructuredProperty((LinkedHashMap<String, Object>) p))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
re.setResulttype(asQualifier(j.read("$.resulttype")));
|
||||
|
||||
JSONArray collfrom = j.read("$.collectedfrom");
|
||||
re.setCollectedfrom(collfrom.stream()
|
||||
.map(c -> asKV((LinkedHashMap<String, Object>) c))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
// will throw exception when the instance is not found
|
||||
JSONArray instances = j.read("$.instance");
|
||||
re.setInstances(instances.stream()
|
||||
.map(i -> {
|
||||
final LinkedHashMap<String, Object> p = (LinkedHashMap<String, Object>) i;
|
||||
final Field<String> license = new Field<String>();
|
||||
license.setValue((String) ((LinkedHashMap<String, Object>) p.get("license")).get("value"));
|
||||
final Instance instance = new Instance();
|
||||
instance.setLicense(license);
|
||||
instance.setAccessright(asQualifier((LinkedHashMap<String, String>) p.get("accessright")));
|
||||
instance.setInstancetype(asQualifier((LinkedHashMap<String, String>) p.get("instancetype")));
|
||||
instance.setHostedby(asKV((LinkedHashMap<String, Object>) p.get("hostedby")));
|
||||
//TODO mapping of distributionlocation
|
||||
instance.setCollectedfrom(asKV((LinkedHashMap<String, Object>) p.get("collectedfrom")));
|
||||
|
||||
Field<String> dateofacceptance = new Field<String>();
|
||||
dateofacceptance.setValue((String) ((LinkedHashMap<String, Object>) p.get("dateofacceptance")).get("value"));
|
||||
instance.setDateofacceptance(dateofacceptance);
|
||||
return instance;
|
||||
}).collect(Collectors.toList()));
|
||||
|
||||
//TODO still to be mapped
|
||||
//re.setCodeRepositoryUrl(j.read("$.coderepositoryurl"));
|
||||
|
||||
break;
|
||||
case datasource:
|
||||
re.setOfficialname(j.read("$.officialname.value"));
|
||||
re.setWebsiteurl(j.read("$.websiteurl.value"));
|
||||
re.setDatasourcetype(asQualifier(j.read("$.datasourcetype")));
|
||||
re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility")));
|
||||
|
||||
break;
|
||||
case organization:
|
||||
re.setLegalname(j.read("$.legalname.value"));
|
||||
re.setLegalshortname(j.read("$.legalshortname.value"));
|
||||
re.setCountry(asQualifier(j.read("$.country")));
|
||||
|
||||
break;
|
||||
case project:
|
||||
re.setProjectTitle(j.read("$.title.value"));
|
||||
re.setCode(j.read("$.code.value"));
|
||||
re.setAcronym(j.read("$.acronym.value"));
|
||||
re.setContracttype(asQualifier(j.read("$.contracttype")));
|
||||
|
||||
JSONArray f = j.read("$.fundingtree");
|
||||
if (!f.isEmpty()) {
|
||||
re.setFundingtree(f.stream()
|
||||
.map(s -> ((LinkedHashMap<String, String>) s).get("value"))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
return new EntityRelEntity().setSource(
|
||||
new TypedRow()
|
||||
.setSourceId(e.getSource().getSourceId())
|
||||
.setDeleted(e.getSource().getDeleted())
|
||||
.setType(e.getSource().getType())
|
||||
.setOaf(serialize(re)));
|
||||
}
|
||||
|
||||
private static KeyValue asKV(LinkedHashMap<String, Object> j) {
|
||||
final KeyValue kv = new KeyValue();
|
||||
kv.setKey((String) j.get("key"));
|
||||
kv.setValue((String) j.get("value"));
|
||||
return kv;
|
||||
}
|
||||
|
||||
private static void mapTitle(DocumentContext j, RelatedEntity re) {
|
||||
final JSONArray a = j.read("$.title");
|
||||
if (!a.isEmpty()) {
|
||||
final StructuredProperty sp = asStructuredProperty((LinkedHashMap<String, Object>) a.get(0));
|
||||
if (StringUtils.isNotBlank(sp.getValue())) {
|
||||
re.setTitle(sp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static StructuredProperty asStructuredProperty(LinkedHashMap<String, Object> j) {
|
||||
final StructuredProperty sp = new StructuredProperty();
|
||||
final String value = (String) j.get("value");
|
||||
if (StringUtils.isNotBlank(value)) {
|
||||
sp.setValue((String) j.get("value"));
|
||||
sp.setQualifier(asQualifier((LinkedHashMap<String, String>) j.get("qualifier")));
|
||||
}
|
||||
return sp;
|
||||
}
|
||||
|
||||
public static Qualifier asQualifier(LinkedHashMap<String, String> j) {
|
||||
final Qualifier q = new Qualifier();
|
||||
|
||||
final String classid = j.get("classid");
|
||||
if (StringUtils.isNotBlank(classid)) {
|
||||
q.setClassid(classid);
|
||||
}
|
||||
|
||||
final String classname = j.get("classname");
|
||||
if (StringUtils.isNotBlank(classname)) {
|
||||
q.setClassname(classname);
|
||||
}
|
||||
|
||||
final String schemeid = j.get("schemeid");
|
||||
if (StringUtils.isNotBlank(schemeid)) {
|
||||
q.setSchemeid(schemeid);
|
||||
}
|
||||
|
||||
final String schemename = j.get("schemename");
|
||||
if (StringUtils.isNotBlank(schemename)) {
|
||||
q.setSchemename(schemename);
|
||||
}
|
||||
return q;
|
||||
}
|
||||
|
||||
public static String serialize(final Object o) {
|
||||
try {
|
||||
return new ObjectMapper()
|
||||
.setSerializationInclusion(JsonInclude.Include.NON_NULL)
|
||||
.writeValueAsString(o);
|
||||
} catch (JsonProcessingException e) {
|
||||
throw new IllegalArgumentException("unable to serialize: " + o.toString(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public static String removePrefix(final String s) {
|
||||
if (s.contains("|")) return substringAfter(s, "|");
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
|
||||
|
||||
public class ISLookupClientFactory {
|
||||
|
||||
private static final Log log = LogFactory.getLog(ISLookupClientFactory.class);
|
||||
|
||||
public static ISLookUpService getLookUpService(final String isLookupUrl) {
|
||||
return getServiceStub(ISLookUpService.class, isLookupUrl);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private static <T> T getServiceStub(final Class<T> clazz, final String endpoint) {
|
||||
log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint));
|
||||
final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
|
||||
jaxWsProxyFactory.setServiceClass(clazz);
|
||||
jaxWsProxyFactory.setAddress(endpoint);
|
||||
return (T) jaxWsProxyFactory.create();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
public class LicenseComparator implements Comparator<Qualifier> {
|
||||
|
||||
@Override
|
||||
public int compare(Qualifier left, Qualifier right) {
|
||||
|
||||
if (left == null && right == null) return 0;
|
||||
if (left == null) return 1;
|
||||
if (right == null) return -1;
|
||||
|
||||
String lClass = left.getClassid();
|
||||
String rClass = right.getClassid();
|
||||
|
||||
if (lClass.equals(rClass)) return 0;
|
||||
|
||||
if (lClass.equals("OPEN SOURCE")) return -1;
|
||||
if (rClass.equals("OPEN SOURCE")) return 1;
|
||||
|
||||
if (lClass.equals("OPEN")) return -1;
|
||||
if (rClass.equals("OPEN")) return 1;
|
||||
|
||||
if (lClass.equals("6MONTHS")) return -1;
|
||||
if (rClass.equals("6MONTHS")) return 1;
|
||||
|
||||
if (lClass.equals("12MONTHS")) return -1;
|
||||
if (rClass.equals("12MONTHS")) return 1;
|
||||
|
||||
if (lClass.equals("EMBARGO")) return -1;
|
||||
if (rClass.equals("EMBARGO")) return 1;
|
||||
|
||||
if (lClass.equals("RESTRICTED")) return -1;
|
||||
if (rClass.equals("RESTRICTED")) return 1;
|
||||
|
||||
if (lClass.equals("CLOSED")) return -1;
|
||||
if (rClass.equals("CLOSED")) return 1;
|
||||
|
||||
if (lClass.equals("UNKNOWN")) return -1;
|
||||
if (rClass.equals("UNKNOWN")) return 1;
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,253 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.io.StringWriter;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import javax.xml.stream.*;
|
||||
import javax.xml.stream.events.Namespace;
|
||||
import javax.xml.stream.events.StartElement;
|
||||
import javax.xml.stream.events.XMLEvent;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
/**
|
||||
* Optimized version of the document parser, drop in replacement of InputDocumentFactory.
|
||||
*
|
||||
* <p>
|
||||
* Faster because:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>Doesn't create a DOM for the full document</li>
|
||||
* <li>Doesn't execute xpaths agains the DOM</li>
|
||||
* <li>Quickly serialize the 'result' element directly in a string.</li>
|
||||
* <li>Uses less memory: less pressure on GC and allows more threads to process this in parallel</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* This class is fully reentrant and can be invoked in parallel.
|
||||
* </p>
|
||||
*
|
||||
* @author claudio
|
||||
*
|
||||
*/
|
||||
public class StreamingInputDocumentFactory {
|
||||
|
||||
private static final String INDEX_FIELD_PREFIX = "__";
|
||||
|
||||
private static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion";
|
||||
|
||||
private static final String DS_ID = INDEX_FIELD_PREFIX + "dsid";
|
||||
|
||||
private static final String RESULT = "result";
|
||||
|
||||
private static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT;
|
||||
|
||||
private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier";
|
||||
|
||||
private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'");
|
||||
|
||||
private final static List<String> dateFormats = Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy");
|
||||
|
||||
private static final String DEFAULTDNETRESULT = "dnetResult";
|
||||
|
||||
private static final String TARGETFIELDS = "targetFields";
|
||||
|
||||
private static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier";
|
||||
|
||||
private static final String ROOT_ELEMENT = "indexRecord";
|
||||
|
||||
private static final int MAX_FIELD_LENGTH = 25000;
|
||||
|
||||
private ThreadLocal<XMLInputFactory> inputFactory = ThreadLocal.withInitial(() -> XMLInputFactory.newInstance());
|
||||
|
||||
private ThreadLocal<XMLOutputFactory> outputFactory = ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance());
|
||||
|
||||
private ThreadLocal<XMLEventFactory> eventFactory = ThreadLocal.withInitial(() -> XMLEventFactory.newInstance());
|
||||
|
||||
private String version;
|
||||
|
||||
private String dsId;
|
||||
|
||||
private String resultName = DEFAULTDNETRESULT;
|
||||
|
||||
public StreamingInputDocumentFactory(final String version, final String dsId) {
|
||||
this(version, dsId, DEFAULTDNETRESULT);
|
||||
}
|
||||
|
||||
public StreamingInputDocumentFactory(final String version, final String dsId, final String resultName) {
|
||||
this.version = version;
|
||||
this.dsId = dsId;
|
||||
this.resultName = resultName;
|
||||
}
|
||||
|
||||
public SolrInputDocument parseDocument(final String inputDocument) {
|
||||
|
||||
final StringWriter results = new StringWriter();
|
||||
final List<Namespace> nsList = Lists.newLinkedList();
|
||||
try {
|
||||
|
||||
XMLEventReader parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument));
|
||||
|
||||
final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>());
|
||||
|
||||
while (parser.hasNext()) {
|
||||
final XMLEvent event = parser.nextEvent();
|
||||
if ((event != null) && event.isStartElement()) {
|
||||
final String localName = event.asStartElement().getName().getLocalPart();
|
||||
|
||||
if (ROOT_ELEMENT.equals(localName)) {
|
||||
nsList.addAll(getNamespaces(event));
|
||||
} else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) {
|
||||
final XMLEvent text = parser.nextEvent();
|
||||
String recordId = getText(text);
|
||||
indexDocument.addField(INDEX_RECORD_ID, recordId);
|
||||
} else if (TARGETFIELDS.equals(localName)) {
|
||||
parseTargetFields(indexDocument, parser);
|
||||
} else if (resultName.equals(localName)) {
|
||||
copyResult(indexDocument, results, parser, nsList, resultName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (version != null) {
|
||||
indexDocument.addField(DS_VERSION, version);
|
||||
}
|
||||
|
||||
if (dsId != null) {
|
||||
indexDocument.addField(DS_ID, dsId);
|
||||
}
|
||||
|
||||
if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
|
||||
indexDocument.clear();
|
||||
System.err.println("missing indexrecord id:\n" + inputDocument);
|
||||
}
|
||||
|
||||
return indexDocument;
|
||||
} catch (XMLStreamException e) {
|
||||
return new SolrInputDocument();
|
||||
}
|
||||
}
|
||||
|
||||
private List<Namespace> getNamespaces(final XMLEvent event) {
|
||||
final List<Namespace> res = Lists.newLinkedList();
|
||||
@SuppressWarnings("unchecked")
|
||||
Iterator<Namespace> nsIter = event.asStartElement().getNamespaces();
|
||||
while (nsIter.hasNext()) {
|
||||
Namespace ns = nsIter.next();
|
||||
res.add(ns);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the targetFields block and add fields to the solr document.
|
||||
*
|
||||
* @param indexDocument
|
||||
* @param parser
|
||||
* @throws XMLStreamException
|
||||
*/
|
||||
protected void parseTargetFields(final SolrInputDocument indexDocument, final XMLEventReader parser) throws XMLStreamException {
|
||||
|
||||
boolean hasFields = false;
|
||||
|
||||
while (parser.hasNext()) {
|
||||
final XMLEvent targetEvent = parser.nextEvent();
|
||||
if (targetEvent.isEndElement() && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (targetEvent.isStartElement()) {
|
||||
final String fieldName = targetEvent.asStartElement().getName().getLocalPart();
|
||||
final XMLEvent text = parser.nextEvent();
|
||||
|
||||
String data = getText(text);
|
||||
|
||||
addField(indexDocument, fieldName, data);
|
||||
hasFields = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasFields) {
|
||||
indexDocument.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy the /indexRecord/result element and children, preserving namespace declarations etc.
|
||||
*
|
||||
* @param indexDocument
|
||||
* @param results
|
||||
* @param parser
|
||||
* @param nsList
|
||||
* @throws XMLStreamException
|
||||
*/
|
||||
protected void copyResult(final SolrInputDocument indexDocument,
|
||||
final StringWriter results,
|
||||
final XMLEventReader parser,
|
||||
final List<Namespace> nsList,
|
||||
final String dnetResult) throws XMLStreamException {
|
||||
final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results);
|
||||
|
||||
for (Namespace ns : nsList) {
|
||||
eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI());
|
||||
}
|
||||
|
||||
StartElement newRecord = eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator());
|
||||
|
||||
// new root record
|
||||
writer.add(newRecord);
|
||||
|
||||
// copy the rest as it is
|
||||
while (parser.hasNext()) {
|
||||
final XMLEvent resultEvent = parser.nextEvent();
|
||||
|
||||
// TODO: replace with depth tracking instead of close tag tracking.
|
||||
if (resultEvent.isEndElement() && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) {
|
||||
writer.add(eventFactory.get().createEndElement("", null, RESULT));
|
||||
break;
|
||||
}
|
||||
|
||||
writer.add(resultEvent);
|
||||
}
|
||||
writer.close();
|
||||
indexDocument.addField(INDEX_RESULT, results.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper used to add a field to a solr doc. It avoids to add empy fields
|
||||
*
|
||||
* @param indexDocument
|
||||
* @param field
|
||||
* @param value
|
||||
*/
|
||||
private final void addField(final SolrInputDocument indexDocument, final String field, final String value) {
|
||||
String cleaned = value.trim();
|
||||
if (!cleaned.isEmpty()) {
|
||||
// log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n");
|
||||
indexDocument.addField(field.toLowerCase(), cleaned);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper used to get the string from a text element.
|
||||
*
|
||||
* @param text
|
||||
* @return the
|
||||
*/
|
||||
protected final String getText(final XMLEvent text) {
|
||||
if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + text.asEndElement().getName().getLocalPart());
|
||||
return "";
|
||||
|
||||
final String data = text.asCharacters().getData();
|
||||
if (data != null && data.length() > MAX_FIELD_LENGTH) {
|
||||
return data.substring(0, MAX_FIELD_LENGTH);
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.stringtemplate.v4.ST;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.removePrefix;
|
||||
import static eu.dnetlib.dhp.graph.utils.XmlSerializationUtils.escapeXml;
|
||||
|
||||
public class TemplateFactory {
|
||||
|
||||
private TemplateResources resources;
|
||||
|
||||
private final static char DELIMITER = '$';
|
||||
|
||||
public TemplateFactory() {
|
||||
try {
|
||||
resources = new TemplateResources();
|
||||
} catch (IOException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public String buildBody(final String type, final List<String> metadata, final List<String> rels, final List<String> children, final List<String> extraInfo) {
|
||||
ST body = getTemplate(resources.getEntity());
|
||||
|
||||
body.add("name", type);
|
||||
body.add("metadata", metadata);
|
||||
body.add("rels", rels);
|
||||
body.add("children", children);
|
||||
body.add("extrainfo", extraInfo);
|
||||
|
||||
return body.render();
|
||||
}
|
||||
|
||||
public String getChild(final String name, final String id, final List<String> metadata) {
|
||||
return getTemplate(resources.getChild())
|
||||
.add("name", name)
|
||||
.add("hasId", !(id == null))
|
||||
.add("id", id != null ? escapeXml(removePrefix(id)) : "")
|
||||
.add("metadata", metadata)
|
||||
.render();
|
||||
}
|
||||
|
||||
public String buildRecord(
|
||||
final OafEntity entity,
|
||||
final String schemaLocation,
|
||||
final String body) {
|
||||
return getTemplate(resources.getRecord())
|
||||
.add("id", escapeXml(removePrefix(entity.getId())))
|
||||
.add("dateofcollection", entity.getDateofcollection())
|
||||
.add("dateoftransformation", entity.getDateoftransformation())
|
||||
.add("schemaLocation", schemaLocation)
|
||||
.add("it", body)
|
||||
.render();
|
||||
}
|
||||
|
||||
public String getRel(final String type,
|
||||
final String objIdentifier,
|
||||
final Collection<String> fields,
|
||||
final String semanticclass,
|
||||
final String semantischeme,
|
||||
final DataInfo info) {
|
||||
return getTemplate(resources.getRel())
|
||||
.add("type", type)
|
||||
.add("objIdentifier", escapeXml(removePrefix(objIdentifier)))
|
||||
.add("class", semanticclass)
|
||||
.add("scheme", semantischeme)
|
||||
.add("metadata", fields)
|
||||
.add("inferred", info.getInferred())
|
||||
.add("trust", info.getTrust())
|
||||
.add("inferenceprovenance", info.getInferenceprovenance())
|
||||
.add("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "")
|
||||
.render();
|
||||
}
|
||||
|
||||
public String getInstance(final String resultId, final List<String> instancemetadata, final List<String> webresources) {
|
||||
return getTemplate(resources.getInstance())
|
||||
.add("instanceId", escapeXml(removePrefix(resultId)))
|
||||
.add("metadata", instancemetadata)
|
||||
.add("webresources", webresources
|
||||
.stream()
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(w -> getWebResource(w))
|
||||
.collect(Collectors.toList()))
|
||||
.render();
|
||||
}
|
||||
|
||||
private String getWebResource(final String identifier) {
|
||||
return getTemplate(resources.getWebresource())
|
||||
.add("identifier", escapeXml(identifier))
|
||||
.render();
|
||||
}
|
||||
|
||||
// HELPERS
|
||||
|
||||
private ST getTemplate(final String res) {
|
||||
return new ST(res, DELIMITER, DELIMITER);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import com.google.common.io.Resources;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
public class TemplateResources {
|
||||
|
||||
private String record = read("eu/dnetlib/dhp/graph/template/record.st");
|
||||
|
||||
private String instance = read("eu/dnetlib/dhp/graph/template/instance.st");
|
||||
|
||||
private String rel = read("eu/dnetlib/dhp/graph/template/rel.st");
|
||||
|
||||
private String webresource = read("eu/dnetlib/dhp/graph/template/webresource.st");
|
||||
|
||||
private String child = read("eu/dnetlib/dhp/graph/template/child.st");
|
||||
|
||||
private String entity = read("eu/dnetlib/dhp/graph/template/entity.st");
|
||||
|
||||
private static String read(final String classpathResource) throws IOException {
|
||||
return Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
public TemplateResources() throws IOException {
|
||||
|
||||
}
|
||||
|
||||
public String getEntity() {
|
||||
return entity;
|
||||
}
|
||||
|
||||
public String getRecord() {
|
||||
return record;
|
||||
}
|
||||
|
||||
public String getInstance() {
|
||||
return instance;
|
||||
}
|
||||
|
||||
public String getRel() {
|
||||
return rel;
|
||||
}
|
||||
|
||||
public String getWebresource() {
|
||||
return webresource;
|
||||
}
|
||||
|
||||
public String getChild() {
|
||||
return child;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,962 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.mycila.xmltool.XMLDoc;
|
||||
import com.mycila.xmltool.XMLTag;
|
||||
import eu.dnetlib.dhp.graph.model.JoinedEntity;
|
||||
import eu.dnetlib.dhp.graph.model.RelatedEntity;
|
||||
import eu.dnetlib.dhp.graph.model.Tuple2;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.OutputFormat;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.dom4j.io.XMLWriter;
|
||||
|
||||
import javax.xml.transform.*;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.io.StringWriter;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.*;
|
||||
import static eu.dnetlib.dhp.graph.utils.XmlSerializationUtils.*;
|
||||
import static org.apache.commons.lang3.StringUtils.isNotBlank;
|
||||
import static org.apache.commons.lang3.StringUtils.substringBefore;
|
||||
|
||||
public class XmlRecordFactory implements Serializable {
|
||||
|
||||
private Set<String> specialDatasourceTypes;
|
||||
|
||||
private ContextMapper contextMapper;
|
||||
|
||||
private String schemaLocation;
|
||||
|
||||
private Set<String> contextes = Sets.newHashSet();
|
||||
|
||||
private boolean indent = false;
|
||||
|
||||
/**
 * Creates a record factory.
 *
 * @param contextMapper the context mapper stored for later use by the factory
 * @param indent whether the produced XML is pretty-printed rather than compact
 * @param schemaLocation the schema location handed to the record template
 * @param otherDatasourceTypesUForUI datasource type class ids that are reported as "other"
 *            in the datasourcetypeui element
 */
public XmlRecordFactory(
        final ContextMapper contextMapper, final boolean indent,
        final String schemaLocation, final Set<String> otherDatasourceTypesUForUI) {
    this.indent = indent;
    this.schemaLocation = schemaLocation;
    this.contextMapper = contextMapper;
    this.specialDatasourceTypes = otherDatasourceTypesUForUI;
}
|
||||
|
||||
public String build(final JoinedEntity je) {
|
||||
final OafEntity entity = je.getEntity();
|
||||
TemplateFactory templateFactory = new TemplateFactory();
|
||||
try {
|
||||
final List<String> metadata = metadata(je.getType(), entity);
|
||||
|
||||
// rels has to be processed before the contexts because they enrich the contextMap with the funding info.
|
||||
final List<String> relations = listRelations(je, templateFactory);
|
||||
|
||||
metadata.addAll(buildContexts(getMainType(je.getType())));
|
||||
metadata.add(parseDataInfo(entity.getDataInfo()));
|
||||
|
||||
final String body = templateFactory.buildBody(
|
||||
getMainType(je.getType()),
|
||||
metadata,
|
||||
relations,
|
||||
listChildren(je, templateFactory), listExtraInfo(je));
|
||||
|
||||
return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent);
|
||||
} catch (final Throwable e) {
|
||||
throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e);
|
||||
}
|
||||
}
|
||||
|
||||
private String printXML(String xml, boolean indent) {
|
||||
try {
|
||||
final Document doc = new SAXReader().read(new StringReader(xml));
|
||||
OutputFormat format = indent ? OutputFormat.createPrettyPrint() : OutputFormat.createCompactFormat();
|
||||
format.setExpandEmptyElements(false);
|
||||
format.setSuppressDeclaration(true);
|
||||
StringWriter sw = new StringWriter();
|
||||
XMLWriter writer = new XMLWriter(sw, format);
|
||||
writer.write(doc);
|
||||
return sw.toString();
|
||||
} catch (IOException | DocumentException e) {
|
||||
throw new IllegalArgumentException("Unable to indent XML. Invalid record:\n" + xml, e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> metadata(final String type, final OafEntity entity) {
|
||||
|
||||
final List<String> metadata = Lists.newArrayList();
|
||||
|
||||
if (entity.getCollectedfrom() != null) {
|
||||
metadata.addAll(entity.getCollectedfrom()
|
||||
.stream()
|
||||
.map(kv -> mapKeyValue("collectedfrom", kv))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (entity.getOriginalId() != null) {
|
||||
metadata.addAll(entity.getOriginalId()
|
||||
.stream()
|
||||
.map(s -> asXmlElement("originalId", s))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (entity.getPid() != null) {
|
||||
metadata.addAll(entity.getPid()
|
||||
.stream()
|
||||
.map(p -> mapStructuredProperty("pid", p))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
if (GraphMappingUtils.isResult(type)) {
|
||||
final Result r = (Result) entity;
|
||||
|
||||
if (r.getTitle() != null) {
|
||||
metadata.addAll(r.getTitle()
|
||||
.stream()
|
||||
.map(t -> mapStructuredProperty("title", t))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getBestaccessright() != null) {
|
||||
metadata.add(mapQualifier("bestaccessright", r.getBestaccessright()));
|
||||
}
|
||||
if (r.getAuthor() != null) {
|
||||
metadata.addAll(r.getAuthor()
|
||||
.stream()
|
||||
.map(a -> {
|
||||
final StringBuilder sb = new StringBuilder("<creator rank=\"" + a.getRank() + "\"");
|
||||
if (isNotBlank(a.getName())) {
|
||||
sb.append(" name=\"" + escapeXml(a.getName()) + "\"");
|
||||
}
|
||||
if (isNotBlank(a.getSurname())) {
|
||||
sb.append(" surname=\"" + escapeXml(a.getSurname()) + "\"");
|
||||
}
|
||||
if (a.getPid() != null) {
|
||||
a.getPid().stream()
|
||||
.filter(sp -> isNotBlank(sp.getQualifier().getClassid()) && isNotBlank(sp.getValue()))
|
||||
.forEach(sp -> {
|
||||
String pidType = escapeXml(sp.getQualifier().getClassid()).replaceAll("\\W", "");
|
||||
String pidValue = escapeXml(sp.getValue());
|
||||
|
||||
// ugly hack: some records provide swapped pidtype and pidvalue
|
||||
if (authorPidTypes.contains(pidValue.toLowerCase().trim())) {
|
||||
sb.append(String.format(" %s=\"%s\"", pidValue, pidType));
|
||||
} else {
|
||||
pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", "");
|
||||
if (isNotBlank(pidType)) {
|
||||
sb.append(String.format(" %s=\"%s\"",
|
||||
pidType,
|
||||
pidValue.toLowerCase().replaceAll("orcid", "")));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
sb.append(">" + escapeXml(a.getFullname()) + "</creator>");
|
||||
return sb.toString();
|
||||
}).collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getContributor() != null) {
|
||||
metadata.addAll(r.getContributor()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("contributor", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getCountry() != null) {
|
||||
metadata.addAll(r.getCountry()
|
||||
.stream()
|
||||
.map(c -> mapQualifier("country", c))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getCoverage() != null) {
|
||||
metadata.addAll(r.getCoverage()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("coverage", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getDateofacceptance() != null) {
|
||||
metadata.add(asXmlElement("dateofacceptance", r.getDateofacceptance().getValue()));
|
||||
}
|
||||
if (r.getDescription() != null) {
|
||||
metadata.addAll(r.getDescription()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("description", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getEmbargoenddate() != null) {
|
||||
metadata.add(asXmlElement("embargoenddate", r.getEmbargoenddate().getValue()));
|
||||
}
|
||||
if (r.getSubject() != null) {
|
||||
metadata.addAll(r.getSubject()
|
||||
.stream()
|
||||
.map(s -> mapStructuredProperty("subject", s))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getLanguage() != null) {
|
||||
metadata.add(mapQualifier("language", r.getLanguage()));
|
||||
}
|
||||
if (r.getRelevantdate() != null) {
|
||||
metadata.addAll(r.getRelevantdate()
|
||||
.stream()
|
||||
.map(s -> mapStructuredProperty("relevantdate", s))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getPublisher() != null) {
|
||||
metadata.add(asXmlElement("publisher", r.getPublisher().getValue()));
|
||||
}
|
||||
if (r.getSource() != null) {
|
||||
metadata.addAll(r.getSource()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("source", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getFormat() != null) {
|
||||
metadata.addAll(r.getFormat()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("format", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getResulttype() != null) {
|
||||
metadata.add(mapQualifier("resulttype", r.getResulttype()));
|
||||
}
|
||||
if (r.getResourcetype() != null) {
|
||||
metadata.add(mapQualifier("resourcetype", r.getResourcetype()));
|
||||
}
|
||||
|
||||
metadata.add(mapQualifier("bestaccessright", getBestAccessright(r)));
|
||||
|
||||
if (r.getContext() != null) {
|
||||
contextes.addAll(r.getContext()
|
||||
.stream()
|
||||
.map(c -> c.getId())
|
||||
.collect(Collectors.toList()));
|
||||
if (contextes.contains("dh-ch::subcommunity::2")) {
|
||||
contextes.add("clarin");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (EntityType.valueOf(type)) {
|
||||
case publication:
|
||||
final Publication pub = (Publication) entity;
|
||||
|
||||
if (pub.getJournal() != null) {
|
||||
final Journal j = pub.getJournal();
|
||||
metadata.add(mapJournal(j));
|
||||
}
|
||||
|
||||
break;
|
||||
case dataset:
|
||||
final Dataset d = (Dataset) entity;
|
||||
if (d.getDevice() != null) {
|
||||
metadata.add(asXmlElement("device", d.getDevice().getValue()));
|
||||
}
|
||||
if (d.getLastmetadataupdate() != null) {
|
||||
metadata.add(asXmlElement("lastmetadataupdate", d.getLastmetadataupdate().getValue()));
|
||||
}
|
||||
if (d.getMetadataversionnumber() != null) {
|
||||
metadata.add(asXmlElement("metadataversionnumber", d.getMetadataversionnumber().getValue()));
|
||||
}
|
||||
if (d.getSize() != null) {
|
||||
metadata.add(asXmlElement("size", d.getSize().getValue()));
|
||||
}
|
||||
if (d.getStoragedate() != null) {
|
||||
metadata.add(asXmlElement("storagedate", d.getStoragedate().getValue()));
|
||||
}
|
||||
if (d.getVersion() != null) {
|
||||
metadata.add(asXmlElement("version", d.getVersion().getValue()));
|
||||
}
|
||||
//TODO d.getGeolocation()
|
||||
|
||||
break;
|
||||
case otherresearchproduct:
|
||||
final OtherResearchProduct orp = (OtherResearchProduct) entity;
|
||||
|
||||
if (orp.getContactperson() != null) {
|
||||
metadata.addAll(orp.getContactperson()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("contactperson", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
if (orp.getContactgroup() != null) {
|
||||
metadata.addAll(orp.getContactgroup()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("contactgroup", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (orp.getTool() != null) {
|
||||
metadata.addAll(orp.getTool()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("tool", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
break;
|
||||
case software:
|
||||
final Software s = (Software) entity;
|
||||
|
||||
if (s.getDocumentationUrl() != null) {
|
||||
metadata.addAll(s.getDocumentationUrl()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("documentationUrl", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (s.getLicense() != null) {
|
||||
metadata.addAll(s.getLicense()
|
||||
.stream()
|
||||
.map(l -> mapStructuredProperty("license", l))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (s.getCodeRepositoryUrl() != null) {
|
||||
metadata.add(asXmlElement("codeRepositoryUrl", s.getCodeRepositoryUrl().getValue()));
|
||||
}
|
||||
if (s.getProgrammingLanguage() != null) {
|
||||
metadata.add(mapQualifier("programmingLanguage", s.getProgrammingLanguage()));
|
||||
}
|
||||
break;
|
||||
case datasource:
|
||||
final Datasource ds = (Datasource) entity;
|
||||
|
||||
if (ds.getDatasourcetype() != null) {
|
||||
mapDatasourceType(metadata, ds.getDatasourcetype());
|
||||
}
|
||||
if (ds.getOpenairecompatibility() != null) {
|
||||
metadata.add(mapQualifier("openairecompatibility", ds.getOpenairecompatibility()));
|
||||
}
|
||||
if (ds.getOfficialname() != null) {
|
||||
metadata.add(asXmlElement("officialname", ds.getOfficialname().getValue()));
|
||||
}
|
||||
if (ds.getEnglishname() != null) {
|
||||
metadata.add(asXmlElement("englishname", ds.getEnglishname().getValue()));
|
||||
}
|
||||
if (ds.getWebsiteurl() != null) {
|
||||
metadata.add(asXmlElement("websiteurl", ds.getWebsiteurl().getValue()));
|
||||
}
|
||||
if (ds.getLogourl() != null) {
|
||||
metadata.add(asXmlElement("logourl", ds.getLogourl().getValue()));
|
||||
}
|
||||
if (ds.getContactemail() != null) {
|
||||
metadata.add(asXmlElement("contactemail", ds.getContactemail().getValue()));
|
||||
}
|
||||
if (ds.getNamespaceprefix() != null) {
|
||||
metadata.add(asXmlElement("namespaceprefix", ds.getNamespaceprefix().getValue()));
|
||||
}
|
||||
if (ds.getLatitude() != null) {
|
||||
metadata.add(asXmlElement("latitude", ds.getLatitude().getValue()));
|
||||
}
|
||||
if (ds.getLongitude() != null) {
|
||||
metadata.add(asXmlElement("longitude", ds.getLongitude().getValue()));
|
||||
}
|
||||
if (ds.getDateofvalidation() != null) {
|
||||
metadata.add(asXmlElement("dateofvalidation", ds.getDateofvalidation().getValue()));
|
||||
}
|
||||
if (ds.getDescription() != null) {
|
||||
metadata.add(asXmlElement("description", ds.getDescription().getValue()));
|
||||
}
|
||||
if (ds.getOdnumberofitems() != null) {
|
||||
metadata.add(asXmlElement("odnumberofitems", ds.getOdnumberofitems().getValue()));
|
||||
}
|
||||
if (ds.getOdnumberofitemsdate() != null) {
|
||||
metadata.add(asXmlElement("odnumberofitemsdate", ds.getOdnumberofitemsdate().getValue()));
|
||||
}
|
||||
if (ds.getOdpolicies() != null) {
|
||||
metadata.add(asXmlElement("odpolicies", ds.getOdpolicies().getValue()));
|
||||
}
|
||||
if (ds.getOdlanguages() != null) {
|
||||
metadata.addAll(ds.getOdlanguages()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("odlanguages", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getOdcontenttypes() != null) {
|
||||
metadata.addAll(ds.getOdcontenttypes()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("odcontenttypes", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getAccessinfopackage() != null) {
|
||||
metadata.addAll(ds.getAccessinfopackage()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("accessinfopackage", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getReleaseenddate() != null) {
|
||||
metadata.add(asXmlElement("releasestartdate", ds.getReleaseenddate().getValue()));
|
||||
}
|
||||
if (ds.getReleaseenddate() != null) {
|
||||
metadata.add(asXmlElement("releaseenddate", ds.getReleaseenddate().getValue()));
|
||||
}
|
||||
if (ds.getMissionstatementurl() != null) {
|
||||
metadata.add(asXmlElement("missionstatementurl", ds.getMissionstatementurl().getValue()));
|
||||
}
|
||||
if (ds.getDataprovider() != null) {
|
||||
metadata.add(asXmlElement("dataprovider", ds.getDataprovider().getValue().toString()));
|
||||
}
|
||||
if (ds.getServiceprovider() != null) {
|
||||
metadata.add(asXmlElement("serviceprovider", ds.getServiceprovider().getValue().toString()));
|
||||
}
|
||||
if (ds.getDatabaseaccesstype() != null) {
|
||||
metadata.add(asXmlElement("databaseaccesstype", ds.getDatabaseaccesstype().getValue()));
|
||||
}
|
||||
if (ds.getDatauploadtype() != null) {
|
||||
metadata.add(asXmlElement("datauploadtype", ds.getDatauploadtype().getValue()));
|
||||
}
|
||||
if (ds.getDatabaseaccessrestriction() != null) {
|
||||
metadata.add(asXmlElement("databaseaccessrestriction", ds.getDatabaseaccessrestriction().getValue()));
|
||||
}
|
||||
if (ds.getDatauploadrestriction() != null) {
|
||||
metadata.add(asXmlElement("datauploadrestriction", ds.getDatauploadrestriction().getValue()));
|
||||
}
|
||||
if (ds.getVersioning() != null) {
|
||||
metadata.add(asXmlElement("versioning", ds.getVersioning().getValue().toString()));
|
||||
}
|
||||
if (ds.getCitationguidelineurl() != null) {
|
||||
metadata.add(asXmlElement("citationguidelineurl", ds.getCitationguidelineurl().getValue()));
|
||||
}
|
||||
if (ds.getQualitymanagementkind() != null) {
|
||||
metadata.add(asXmlElement("qualitymanagementkind", ds.getQualitymanagementkind().getValue()));
|
||||
}
|
||||
if (ds.getPidsystems() != null) {
|
||||
metadata.add(asXmlElement("pidsystems", ds.getPidsystems().getValue()));
|
||||
}
|
||||
if (ds.getCertificates() != null) {
|
||||
metadata.add(asXmlElement("certificates", ds.getCertificates().getValue()));
|
||||
}
|
||||
if (ds.getPolicies() != null) {
|
||||
metadata.addAll(ds.getPolicies()
|
||||
.stream()
|
||||
.map(kv -> mapKeyValue("policies", kv))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getJournal() != null) {
|
||||
metadata.add(mapJournal(ds.getJournal()));
|
||||
}
|
||||
if (ds.getSubjects() != null) {
|
||||
metadata.addAll(ds.getSubjects()
|
||||
.stream()
|
||||
.map(sp -> mapStructuredProperty("subject", sp))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
break;
|
||||
case organization:
|
||||
final Organization o = (Organization) entity;
|
||||
|
||||
if (o.getLegalshortname() != null) {
|
||||
metadata.add(asXmlElement("legalshortname", o.getLegalshortname().getValue()));
|
||||
}
|
||||
if (o.getLegalname() != null) {
|
||||
metadata.add(asXmlElement("legalname", o.getLegalname().getValue()));
|
||||
}
|
||||
if (o.getAlternativeNames() != null) {
|
||||
metadata.addAll(o.getAlternativeNames()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("alternativeNames", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (o.getWebsiteurl() != null) {
|
||||
metadata.add(asXmlElement("websiteurl", o.getWebsiteurl().getValue()));
|
||||
}
|
||||
if (o.getLogourl() != null) {
|
||||
metadata.add(asXmlElement("websiteurl", o.getLogourl().getValue()));
|
||||
}
|
||||
|
||||
if (o.getEclegalbody() != null) {
|
||||
metadata.add(asXmlElement("eclegalbody", o.getEclegalbody().getValue()));
|
||||
}
|
||||
if (o.getEclegalperson() != null) {
|
||||
metadata.add(asXmlElement("eclegalperson", o.getEclegalperson().getValue()));
|
||||
}
|
||||
if (o.getEcnonprofit() != null) {
|
||||
metadata.add(asXmlElement("ecnonprofit", o.getEcnonprofit().getValue()));
|
||||
}
|
||||
if (o.getEcresearchorganization() != null) {
|
||||
metadata.add(asXmlElement("ecresearchorganization", o.getEcresearchorganization().getValue()));
|
||||
}
|
||||
if (o.getEchighereducation() != null) {
|
||||
metadata.add(asXmlElement("echighereducation", o.getEchighereducation().getValue()));
|
||||
}
|
||||
if (o.getEcinternationalorganization() != null) {
|
||||
metadata.add(asXmlElement("ecinternationalorganizationeurinterests", o.getEcinternationalorganization().getValue()));
|
||||
}
|
||||
if (o.getEcinternationalorganization() != null) {
|
||||
metadata.add(asXmlElement("ecinternationalorganization", o.getEcinternationalorganization().getValue()));
|
||||
}
|
||||
if (o.getEcenterprise() != null) {
|
||||
metadata.add(asXmlElement("ecenterprise", o.getEcenterprise().getValue()));
|
||||
}
|
||||
if (o.getEcsmevalidated() != null) {
|
||||
metadata.add(asXmlElement("ecsmevalidated", o.getEcsmevalidated().getValue()));
|
||||
}
|
||||
if (o.getEcnutscode() != null) {
|
||||
metadata.add(asXmlElement("ecnutscode", o.getEcnutscode().getValue()));
|
||||
}
|
||||
if (o.getCountry() != null) {
|
||||
metadata.add(mapQualifier("country", o.getCountry()));
|
||||
}
|
||||
|
||||
break;
|
||||
case project:
|
||||
|
||||
final Project p = (Project) entity;
|
||||
|
||||
if (p.getWebsiteurl() != null) {
|
||||
metadata.add(asXmlElement("websiteurl", p.getWebsiteurl().getValue()));
|
||||
}
|
||||
if (p.getCode() != null) {
|
||||
metadata.add(asXmlElement("code", p.getCode().getValue()));
|
||||
}
|
||||
if (p.getAcronym() != null) {
|
||||
metadata.add(asXmlElement("acronym", p.getAcronym().getValue()));
|
||||
}
|
||||
if (p.getTitle() != null) {
|
||||
metadata.add(asXmlElement("title", p.getTitle().getValue()));
|
||||
}
|
||||
if (p.getStartdate() != null) {
|
||||
metadata.add(asXmlElement("startdate", p.getStartdate().getValue()));
|
||||
}
|
||||
if (p.getEnddate() != null) {
|
||||
metadata.add(asXmlElement("enddate", p.getEnddate().getValue()));
|
||||
}
|
||||
if (p.getCallidentifier() != null) {
|
||||
metadata.add(asXmlElement("callidentifier", p.getCallidentifier().getValue()));
|
||||
}
|
||||
if (p.getKeywords() != null) {
|
||||
metadata.add(asXmlElement("keywords", p.getKeywords().getValue()));
|
||||
}
|
||||
if (p.getDuration() != null) {
|
||||
metadata.add(asXmlElement("duration", p.getDuration().getValue()));
|
||||
}
|
||||
if (p.getEcarticle29_3() != null) {
|
||||
metadata.add(asXmlElement("ecarticle29_3", p.getEcarticle29_3().getValue()));
|
||||
}
|
||||
if (p.getSubjects() != null) {
|
||||
metadata.addAll(p.getSubjects()
|
||||
.stream()
|
||||
.map(sp -> mapStructuredProperty("subject", sp))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (p.getContracttype() != null) {
|
||||
metadata.add(mapQualifier("contracttype", p.getContracttype()));
|
||||
}
|
||||
if (p.getEcsc39() != null) {
|
||||
metadata.add(asXmlElement("ecsc39", p.getEcsc39().getValue()));
|
||||
}
|
||||
if (p.getContactfullname() != null) {
|
||||
metadata.add(asXmlElement("contactfullname", p.getContactfullname().getValue()));
|
||||
}
|
||||
if (p.getContactfax() != null) {
|
||||
metadata.add(asXmlElement("contactfax", p.getContactfax().getValue()));
|
||||
}
|
||||
if (p.getContactphone() != null) {
|
||||
metadata.add(asXmlElement("contactphone", p.getContactphone().getValue()));
|
||||
}
|
||||
if (p.getContactemail() != null) {
|
||||
metadata.add(asXmlElement("contactemail", p.getContactemail().getValue()));
|
||||
}
|
||||
if (p.getSummary() != null) {
|
||||
metadata.add(asXmlElement("summary", p.getSummary().getValue()));
|
||||
}
|
||||
if (p.getCurrency() != null) {
|
||||
metadata.add(asXmlElement("currency", p.getCurrency().getValue()));
|
||||
}
|
||||
if (p.getTotalcost() != null) {
|
||||
metadata.add(asXmlElement("totalcost", p.getTotalcost().toString()));
|
||||
}
|
||||
if (p.getFundedamount() != null) {
|
||||
metadata.add(asXmlElement("fundedamount", p.getFundedamount().toString()));
|
||||
}
|
||||
if (p.getFundingtree() != null) {
|
||||
metadata.addAll(p.getFundingtree()
|
||||
.stream()
|
||||
.map(ft -> asXmlElement("fundingtree", ft.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid entity type: " + type);
|
||||
}
|
||||
|
||||
return metadata;
|
||||
}
|
||||
|
||||
private void mapDatasourceType(List<String> metadata, final Qualifier dsType) {
|
||||
metadata.add(mapQualifier("datasourcetype", dsType));
|
||||
|
||||
if (specialDatasourceTypes.contains(dsType.getClassid())) {
|
||||
dsType.setClassid("other");
|
||||
dsType.setClassname("other");
|
||||
}
|
||||
metadata.add(mapQualifier("datasourcetypeui", dsType));
|
||||
}
|
||||
|
||||
private Qualifier getBestAccessright(final Result r) {
|
||||
Qualifier bestAccessRight = new Qualifier();
|
||||
bestAccessRight.setClassid("UNKNOWN");
|
||||
bestAccessRight.setClassname("not available");
|
||||
bestAccessRight.setSchemeid("dnet:access_modes");
|
||||
bestAccessRight.setSchemename("dnet:access_modes");
|
||||
|
||||
final LicenseComparator lc = new LicenseComparator();
|
||||
for (final Instance instance : r.getInstance()) {
|
||||
if (lc.compare(bestAccessRight, instance.getAccessright()) > 0) {
|
||||
bestAccessRight = instance.getAccessright();
|
||||
}
|
||||
}
|
||||
return bestAccessRight;
|
||||
}
|
||||
|
||||
private List<String> listRelations(final JoinedEntity je, TemplateFactory templateFactory) {
|
||||
final List<String> rels = Lists.newArrayList();
|
||||
|
||||
for (final Tuple2 link : je.getLinks()) {
|
||||
|
||||
final Relation rel = link.getRelation();
|
||||
final RelatedEntity re = link.getRelatedEntity();
|
||||
final String targetType = link.getRelatedEntity().getType();
|
||||
|
||||
final List<String> metadata = Lists.newArrayList();
|
||||
switch (EntityType.valueOf(targetType)) {
|
||||
case publication:
|
||||
case dataset:
|
||||
case otherresearchproduct:
|
||||
case software:
|
||||
if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) {
|
||||
metadata.add(mapStructuredProperty("title", re.getTitle()));
|
||||
}
|
||||
if (isNotBlank(re.getDateofacceptance())) {
|
||||
metadata.add(asXmlElement("dateofacceptance", re.getDateofacceptance()));
|
||||
}
|
||||
if (isNotBlank(re.getPublisher())) {
|
||||
metadata.add(asXmlElement("publisher", re.getPublisher()));
|
||||
}
|
||||
if (isNotBlank(re.getCodeRepositoryUrl())) {
|
||||
metadata.add(asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl()));
|
||||
}
|
||||
if (re.getResulttype() != null & !re.getResulttype().isBlank()) {
|
||||
metadata.add(mapQualifier("resulttype", re.getResulttype()));
|
||||
}
|
||||
if (re.getCollectedfrom() != null) {
|
||||
metadata.addAll(re.getCollectedfrom()
|
||||
.stream()
|
||||
.map(kv -> mapKeyValue("collectedfrom", kv))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (re.getPid() != null) {
|
||||
metadata.addAll(re.getPid()
|
||||
.stream()
|
||||
.map(p -> mapStructuredProperty("pid", p))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
break;
|
||||
case datasource:
|
||||
if (isNotBlank(re.getOfficialname())) {
|
||||
metadata.add(asXmlElement("officialname", re.getOfficialname()));
|
||||
}
|
||||
if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) {
|
||||
mapDatasourceType(metadata, re.getDatasourcetype());
|
||||
}
|
||||
if (re.getOpenairecompatibility() != null & !re.getOpenairecompatibility().isBlank()) {
|
||||
metadata.add(mapQualifier("openairecompatibility", re.getOpenairecompatibility()));
|
||||
}
|
||||
break;
|
||||
case organization:
|
||||
if (isNotBlank(re.getLegalname())) {
|
||||
metadata.add(asXmlElement("legalname", re.getLegalname()));
|
||||
}
|
||||
if (isNotBlank(re.getLegalshortname())) {
|
||||
metadata.add(asXmlElement("legalshortname", re.getLegalshortname()));
|
||||
}
|
||||
if (re.getCountry() != null & !re.getCountry().isBlank()) {
|
||||
metadata.add(mapQualifier("country", re.getCountry()));
|
||||
}
|
||||
break;
|
||||
case project:
|
||||
if (isNotBlank(re.getProjectTitle())) {
|
||||
metadata.add(asXmlElement("title", re.getProjectTitle()));
|
||||
}
|
||||
if (isNotBlank(re.getCode())) {
|
||||
metadata.add(asXmlElement("code", re.getCode()));
|
||||
}
|
||||
if (isNotBlank(re.getAcronym())) {
|
||||
metadata.add(asXmlElement("acronym", re.getAcronym()));
|
||||
}
|
||||
if (re.getContracttype() != null & !re.getContracttype().isBlank()) {
|
||||
metadata.add(mapQualifier("contracttype", re.getContracttype()));
|
||||
}
|
||||
if (re.getFundingtree() != null) {
|
||||
metadata.addAll(re.getFundingtree()
|
||||
.stream()
|
||||
.peek(ft -> fillContextMap(ft))
|
||||
.map(ft -> getRelFundingTree(ft))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid target type: " + targetType);
|
||||
|
||||
}
|
||||
final DataInfo info = rel.getDataInfo();
|
||||
|
||||
rels.add(templateFactory.getRel(
|
||||
targetType,
|
||||
rel.getTarget(),
|
||||
Sets.newHashSet(metadata),
|
||||
getInverseRelClass(rel.getRelClass()),
|
||||
getScheme(targetType, re.getType()),
|
||||
info));
|
||||
}
|
||||
return rels;
|
||||
}
|
||||
|
||||
private List<String> listChildren(final JoinedEntity je, TemplateFactory templateFactory) {
|
||||
|
||||
final List<String> children = Lists.newArrayList();
|
||||
|
||||
if (MainEntityType.result.toString().equals(getMainType(je.getType()))) {
|
||||
final List<Instance> instances = ((Result) je.getEntity()).getInstance();
|
||||
if (instances != null) {
|
||||
for (final Instance instance : ((Result) je.getEntity()).getInstance()) {
|
||||
|
||||
final List<String> fields = Lists.newArrayList();
|
||||
|
||||
if (instance.getAccessright() != null && !instance.getAccessright().isBlank()) {
|
||||
fields.add(mapQualifier("accessright", instance.getAccessright()));
|
||||
}
|
||||
if (instance.getCollectedfrom() != null) {
|
||||
fields.add(mapKeyValue("collectedfrom", instance.getCollectedfrom()));
|
||||
}
|
||||
if (instance.getHostedby() != null) {
|
||||
fields.add(mapKeyValue("hostedby", instance.getHostedby()));
|
||||
}
|
||||
if (instance.getDateofacceptance() != null && isNotBlank(instance.getDateofacceptance().getValue())) {
|
||||
fields.add(asXmlElement("dateofacceptance", instance.getDateofacceptance().getValue()));
|
||||
}
|
||||
if (instance.getInstancetype() != null && !instance.getInstancetype().isBlank()) {
|
||||
fields.add(mapQualifier("instancetype", instance.getInstancetype()));
|
||||
}
|
||||
if (isNotBlank(instance.getDistributionlocation())) {
|
||||
fields.add(asXmlElement("distributionlocation", instance.getDistributionlocation()));
|
||||
}
|
||||
if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) {
|
||||
fields.add(asXmlElement("refereed", instance.getRefereed().getValue()));
|
||||
}
|
||||
if (instance.getProcessingchargeamount() != null && isNotBlank(instance.getProcessingchargeamount().getValue())) {
|
||||
fields.add(asXmlElement("processingchargeamount", instance.getProcessingchargeamount().getValue()));
|
||||
}
|
||||
if (instance.getProcessingchargecurrency() != null && isNotBlank(instance.getProcessingchargecurrency().getValue())) {
|
||||
fields.add(asXmlElement("processingchargecurrency", instance.getProcessingchargecurrency().getValue()));
|
||||
}
|
||||
|
||||
children.add(templateFactory.getInstance(instance.getHostedby().getKey(), fields, instance.getUrl()));
|
||||
}
|
||||
}
|
||||
final List<ExternalReference> ext = ((Result) je.getEntity()).getExternalReference();
|
||||
if (ext != null) {
|
||||
for (final ExternalReference er : ((Result) je.getEntity()).getExternalReference()) {
|
||||
|
||||
final List<String> fields = Lists.newArrayList();
|
||||
|
||||
if (isNotBlank(er.getSitename())) {
|
||||
fields.add(asXmlElement("sitename", er.getSitename()));
|
||||
}
|
||||
if (isNotBlank(er.getLabel())) {
|
||||
fields.add(asXmlElement("label", er.getLabel()));
|
||||
}
|
||||
if (isNotBlank(er.getUrl())) {
|
||||
fields.add(asXmlElement("url", er.getUrl()));
|
||||
}
|
||||
if (isNotBlank(er.getDescription())) {
|
||||
fields.add(asXmlElement("description", er.getDescription()));
|
||||
}
|
||||
if (isNotBlank(er.getUrl())) {
|
||||
fields.add(mapQualifier("qualifier", er.getQualifier()));
|
||||
}
|
||||
if (isNotBlank(er.getRefidentifier())) {
|
||||
fields.add(asXmlElement("refidentifier", er.getRefidentifier()));
|
||||
}
|
||||
if (isNotBlank(er.getQuery())) {
|
||||
fields.add(asXmlElement("query", er.getQuery()));
|
||||
}
|
||||
|
||||
children.add(templateFactory.getChild("externalreference", null, fields));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return children;
|
||||
}
|
||||
|
||||
private List<String> listExtraInfo(JoinedEntity je) {
|
||||
final List<ExtraInfo> extraInfo = je.getEntity().getExtraInfo();
|
||||
return extraInfo != null ? extraInfo
|
||||
.stream()
|
||||
.map(e -> mapExtraInfo(e))
|
||||
.collect(Collectors.toList()) : Lists.newArrayList();
|
||||
}
|
||||
|
||||
private List<String> buildContexts(final String type) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
if ((contextMapper != null) && !contextMapper.isEmpty() && MainEntityType.result.toString().equals(type)) {
|
||||
|
||||
XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot");
|
||||
|
||||
for (final String context : contextes) {
|
||||
|
||||
String id = "";
|
||||
for (final String token : Splitter.on("::").split(context)) {
|
||||
id += token;
|
||||
|
||||
final ContextDef def = contextMapper.get(id);
|
||||
|
||||
if (def == null) {
|
||||
continue;
|
||||
// throw new IllegalStateException(String.format("cannot find context for id '%s'", id));
|
||||
}
|
||||
|
||||
if (def.getName().equals("context")) {
|
||||
final String xpath = "//context/@id='" + def.getId() + "'";
|
||||
if (!document.gotoRoot().rawXpathBoolean(xpath, new Object())) {
|
||||
document = addContextDef(document.gotoRoot(), def);
|
||||
}
|
||||
}
|
||||
|
||||
if (def.getName().equals("category")) {
|
||||
final String rootId = substringBefore(def.getId(), "::");
|
||||
document = addContextDef(document.gotoRoot().gotoTag("//context[./@id='" + rootId + "']", new Object()), def);
|
||||
}
|
||||
|
||||
if (def.getName().equals("concept")) {
|
||||
document = addContextDef(document, def).gotoParent();
|
||||
}
|
||||
id += "::";
|
||||
}
|
||||
}
|
||||
final Transformer transformer = getTransformer();
|
||||
for (final org.w3c.dom.Element x : document.gotoRoot().getChildElement()) {
|
||||
try {
|
||||
res.add(asStringElement(x, transformer));
|
||||
} catch (final TransformerException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
private Transformer getTransformer() {
|
||||
try {
|
||||
Transformer transformer = TransformerFactory.newInstance().newTransformer();
|
||||
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
|
||||
return transformer;
|
||||
} catch (TransformerConfigurationException e) {
|
||||
throw new IllegalStateException("unable to create javax.xml.transform.Transformer", e);
|
||||
}
|
||||
}
|
||||
|
||||
private XMLTag addContextDef(final XMLTag tag, final ContextDef def) {
|
||||
tag.addTag(def.getName()).addAttribute("id", def.getId()).addAttribute("label", def.getLabel());
|
||||
if ((def.getType() != null) && !def.getType().isEmpty()) {
|
||||
tag.addAttribute("type", def.getType());
|
||||
}
|
||||
return tag;
|
||||
}
|
||||
|
||||
private String asStringElement(final org.w3c.dom.Element element, final Transformer transformer) throws TransformerException {
|
||||
final StringWriter buffer = new StringWriter();
|
||||
transformer.transform(new DOMSource(element), new StreamResult(buffer));
|
||||
return buffer.toString();
|
||||
}
|
||||
|
||||
private void fillContextMap(final String xmlTree) {
|
||||
|
||||
Document fundingPath;
|
||||
try {
|
||||
fundingPath = new SAXReader().read(new StringReader(xmlTree));
|
||||
} catch (final DocumentException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
try {
|
||||
final Node funder = fundingPath.selectSingleNode("//funder");
|
||||
|
||||
if (funder != null) {
|
||||
|
||||
final String funderShortName = funder.valueOf("./shortname");
|
||||
contextes.add(funderShortName);
|
||||
|
||||
contextMapper.put(funderShortName, new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding"));
|
||||
final Node level0 = fundingPath.selectSingleNode("//funding_level_0");
|
||||
if (level0 != null) {
|
||||
final String level0Id = Joiner.on("::").join(funderShortName, level0.valueOf("./name"));
|
||||
contextMapper.put(level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", ""));
|
||||
final Node level1 = fundingPath.selectSingleNode("//funding_level_1");
|
||||
if (level1 == null) {
|
||||
contextes.add(level0Id);
|
||||
} else {
|
||||
final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name"));
|
||||
contextMapper.put(level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", ""));
|
||||
final Node level2 = fundingPath.selectSingleNode("//funding_level_2");
|
||||
if (level2 == null) {
|
||||
contextes.add(level1Id);
|
||||
} else {
|
||||
final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name"));
|
||||
contextMapper.put(level2Id, new ContextDef(level2Id, level2.valueOf("./description"), "concept", ""));
|
||||
contextes.add(level2Id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (final NullPointerException e) {
|
||||
throw new IllegalArgumentException("malformed funding path: " + xmlTree, e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private String getRelFundingTree(final String xmlTree) {
|
||||
String funding = "<funding>";
|
||||
try {
|
||||
final Document ftree = new SAXReader().read(new StringReader(xmlTree));
|
||||
funding = "<funding>";
|
||||
|
||||
funding += getFunderElement(ftree);
|
||||
|
||||
for (final Object o : Lists.reverse(ftree.selectNodes("//fundingtree//*[starts-with(local-name(),'funding_level_')]"))) {
|
||||
final Element e = (Element) o;
|
||||
final String _id = e.valueOf("./id");
|
||||
funding += "<" + e.getName() + " name=\"" + escapeXml(e.valueOf("./name")) + "\">" + escapeXml(_id) + "</" + e.getName() + ">";
|
||||
}
|
||||
} catch (final DocumentException e) {
|
||||
throw new IllegalArgumentException("unable to parse funding tree: " + xmlTree + "\n" + e.getMessage());
|
||||
} finally {
|
||||
funding += "</funding>";
|
||||
}
|
||||
return funding;
|
||||
}
|
||||
|
||||
private String getFunderElement(final Document ftree) {
|
||||
final String funderId = ftree.valueOf("//fundingtree/funder/id/text()");
|
||||
final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname/text()");
|
||||
final String funderName = ftree.valueOf("//fundingtree/funder/name/text()");
|
||||
final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction/text()");
|
||||
|
||||
return "<funder id=\"" + escapeXml(funderId) + "\" shortname=\"" + escapeXml(funderShortName) + "\" name=\"" + escapeXml(funderName)
|
||||
+ "\" jurisdiction=\"" + escapeXml(funderJurisdiction) + "\" />";
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,151 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.removePrefix;
|
||||
import static org.apache.commons.lang3.StringUtils.isBlank;
|
||||
import static org.apache.commons.lang3.StringUtils.isNotBlank;
|
||||
|
||||
public class XmlSerializationUtils {
|
||||
|
||||
// XML 1.0
|
||||
// #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
||||
private final static String xml10pattern = "[^"
|
||||
+ "\u0009\r\n"
|
||||
+ "\u0020-\uD7FF"
|
||||
+ "\uE000-\uFFFD"
|
||||
+ "\ud800\udc00-\udbff\udfff"
|
||||
+ "]";
|
||||
|
||||
public static String mapJournal(Journal j) {
|
||||
final String attrs = new StringBuilder()
|
||||
.append(attr("issn", j.getIssnPrinted()))
|
||||
.append(attr("eissn", j.getIssnOnline()))
|
||||
.append(attr("lissn", j.getIssnLinking()))
|
||||
.append(attr("ep", j.getEp()))
|
||||
.append(attr("iss", j.getIss()))
|
||||
.append(attr("sp", j.getSp()))
|
||||
.append(attr("vol", j.getVol()))
|
||||
.toString()
|
||||
.trim();
|
||||
|
||||
return new StringBuilder()
|
||||
.append("<journal")
|
||||
.append(isNotBlank(attrs) ? (" " + attrs) : "")
|
||||
.append(">")
|
||||
.append(escapeXml(j.getName()))
|
||||
.append("</journal>")
|
||||
.toString();
|
||||
}
|
||||
|
||||
private static String attr(final String name, final String value) {
|
||||
return isNotBlank(value) ? name + "=\"" + escapeXml(value) + "\" " : "";
|
||||
}
|
||||
|
||||
public static String mapStructuredProperty(String name, StructuredProperty t) {
|
||||
return asXmlElement(name, t.getValue(), t.getQualifier(), t.getDataInfo() != null ? t.getDataInfo() : null);
|
||||
}
|
||||
|
||||
public static String mapQualifier(String name, Qualifier q) {
|
||||
return asXmlElement(name, "", q, null);
|
||||
}
|
||||
|
||||
public static String escapeXml(final String value) {
|
||||
return value
|
||||
.replaceAll("&", "&")
|
||||
.replaceAll("<", "<")
|
||||
.replaceAll(">", ">")
|
||||
.replaceAll("\"", """)
|
||||
.replaceAll("'", "'")
|
||||
.replaceAll(xml10pattern, "");
|
||||
}
|
||||
|
||||
public static String parseDataInfo(final DataInfo dataInfo) {
|
||||
return new StringBuilder()
|
||||
.append("<datainfo>")
|
||||
.append(asXmlElement("inferred", dataInfo.getInferred() + ""))
|
||||
.append(asXmlElement("deletedbyinference", dataInfo.getDeletedbyinference() + ""))
|
||||
.append(asXmlElement("trust", dataInfo.getTrust() + ""))
|
||||
.append(asXmlElement("inferenceprovenance", dataInfo.getInferenceprovenance() + ""))
|
||||
.append(asXmlElement("provenanceaction", null, dataInfo.getProvenanceaction(), null))
|
||||
.append("</datainfo>")
|
||||
.toString();
|
||||
}
|
||||
|
||||
private static StringBuilder dataInfoAsAttributes(final StringBuilder sb, final DataInfo info) {
|
||||
return sb
|
||||
.append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : ""))
|
||||
.append(attr("inferenceprovenance", info.getInferenceprovenance()))
|
||||
.append(attr("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : ""))
|
||||
.append(attr("trust", info.getTrust()));
|
||||
}
|
||||
|
||||
public static String mapKeyValue(final String name, final KeyValue kv) {
|
||||
return new StringBuilder()
|
||||
.append("<")
|
||||
.append(name)
|
||||
.append(" name=\"")
|
||||
.append(escapeXml(kv.getValue()))
|
||||
.append("\" id=\"")
|
||||
.append(escapeXml(removePrefix(kv.getKey())))
|
||||
.append("\"/>")
|
||||
.toString();
|
||||
}
|
||||
|
||||
public static String mapExtraInfo(final ExtraInfo e) {
|
||||
return new StringBuilder("<extraInfo ")
|
||||
.append("name=\"" + e.getName() + "\" ")
|
||||
.append("typology=\"" + e.getTypology() + "\" ")
|
||||
.append("provenance=\"" + e.getProvenance() + "\" ")
|
||||
.append("trust=\"" + e.getTrust() + "\"")
|
||||
.append(">")
|
||||
.append(e.getValue())
|
||||
.append("</extraInfo>")
|
||||
.toString();
|
||||
}
|
||||
|
||||
public static String asXmlElement(final String name, final String value) {
|
||||
return asXmlElement(name, value, null, null);
|
||||
}
|
||||
|
||||
public static String asXmlElement(final String name, final String value, final Qualifier q, final DataInfo info) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("<");
|
||||
sb.append(name);
|
||||
if (q != null) {
|
||||
sb.append(getAttributes(q));
|
||||
}
|
||||
if (info != null) {
|
||||
sb
|
||||
.append(" ")
|
||||
.append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : ""))
|
||||
.append(attr("inferenceprovenance", info.getInferenceprovenance()))
|
||||
.append(attr("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : ""))
|
||||
.append(attr("trust", info.getTrust()));
|
||||
}
|
||||
if (isBlank(value)) {
|
||||
sb.append("/>");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
sb.append(">");
|
||||
sb.append(escapeXml(value));
|
||||
sb.append("</");
|
||||
sb.append(name);
|
||||
sb.append(">");
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static String getAttributes(final Qualifier q) {
|
||||
if (q == null || q.isBlank()) return "";
|
||||
|
||||
return new StringBuilder(" ")
|
||||
.append(attr("classid", q.getClassid()))
|
||||
.append(attr("classname", q.getClassname()))
|
||||
.append(attr("schemeid", q.getSchemeid()))
|
||||
.append(attr("schemename", q.getSchemename()))
|
||||
.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
net.sf.saxon.TransformerFactoryImpl
|
|
@ -0,0 +1,6 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
||||
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,7 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read the XML records", "paramRequired": true},
|
||||
{"paramName":"f", "paramLongName":"format", "paramDescription": "MDFormat name found in the IS profile", "paramRequired": true},
|
||||
{"paramName":"b", "paramLongName":"batchSize", "paramDescription": "size of the batch of documents sent to solr", "paramRequired": false}
|
||||
]
|
|
@ -0,0 +1,34 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_db_name</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/applicationHistory</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,99 @@
|
|||
<workflow-app name="index_infospace_graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>hive_db_name</name>
|
||||
<description>the target hive database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="reuse_records"/>
|
||||
|
||||
<decision name="reuse_records">
|
||||
<switch>
|
||||
<case to="adjancency_lists">${wf:conf('reuseRecords') eq false}</case>
|
||||
<case to="to_solr_index">${wf:conf('reuseRecords') eq true}</case>
|
||||
<default to="adjancency_lists"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="adjancency_lists">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>build_adjacency_lists</name>
|
||||
<class>eu.dnetlib.dhp.graph.SparkXmlRecordBuilderJob</class>
|
||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn</arg>
|
||||
<arg>-is</arg> <arg>${isLookupUrl}</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="to_solr_index"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="to_solr_index">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>to_solr_index</name>
|
||||
<class>eu.dnetlib.dhp.graph.SparkXmlIndexingJob</class>
|
||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn</arg>
|
||||
<arg>-is</arg> <arg>${isLookupUrl}</arg>
|
||||
<arg>--sourcePath</arg><arg>${outputPath}/xml</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--batchSize</arg><arg>${batchSize}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,3 @@
|
|||
<$name$$if(hasId)$ objidentifier="$id$"$else$$endif$>
|
||||
$metadata:{ it | $it$ }$
|
||||
</$name$>
|
|
@ -0,0 +1,10 @@
|
|||
<oaf:$name$>
|
||||
$metadata:{ it | $it$ }$
|
||||
<rels>
|
||||
$rels:{ it | $it$ }$
|
||||
</rels>
|
||||
<children>
|
||||
$children:{ it | $it$ }$
|
||||
</children>
|
||||
</oaf:$name$>
|
||||
$extrainfo:{ it | $it$ }$
|
|
@ -0,0 +1,4 @@
|
|||
<instance id="$instanceId$">
|
||||
$metadata:{ it | $it$ }$
|
||||
$webresources:{ it | $it$ }$
|
||||
</instance>
|
|
@ -0,0 +1,17 @@
|
|||
<?xml version="1.0"?>
|
||||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header>
|
||||
<dri:objIdentifier>$id$</dri:objIdentifier>
|
||||
<dri:dateOfCollection>$dateofcollection$</dri:dateOfCollection>
|
||||
<dri:dateOfTransformation>$dateoftransformation$</dri:dateOfTransformation>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xsi:schemaLocation="http://namespace.openaire.eu/oaf $schemaLocation$">
|
||||
$it$
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -0,0 +1,4 @@
|
|||
<rel inferred="$inferred$" trust="$trust$" inferenceprovenance="$inferenceprovenance$" provenanceaction="$provenanceaction$">
|
||||
<to class="$class$" scheme="$scheme$" type="$type$">$objIdentifier$</to>
|
||||
$metadata:{ it | $it$ }$
|
||||
</rel>
|
|
@ -0,0 +1,3 @@
|
|||
<webresource>
|
||||
<url>$identifier$</url>
|
||||
</webresource>
|
|
@ -0,0 +1,66 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import eu.dnetlib.dhp.graph.model.EntityRelEntity;
|
||||
import eu.dnetlib.dhp.graph.model.RelatedEntity;
|
||||
import eu.dnetlib.dhp.graph.utils.GraphMappingUtils;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
public class MappingUtilsTest {
|
||||
|
||||
private GraphMappingUtils utils;
|
||||
|
||||
@Before
|
||||
public void setUp() {
|
||||
utils = new GraphMappingUtils();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOafMappingDatasource() throws IOException {
|
||||
|
||||
final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("datasource.json"));
|
||||
final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class);
|
||||
e.getSource().setType("datasource");
|
||||
|
||||
final EntityRelEntity out = utils.asRelatedEntity(e);
|
||||
System.out.println(out);
|
||||
|
||||
}
|
||||
|
||||
//@Test
|
||||
public void testOafMappingResult() throws IOException {
|
||||
|
||||
final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("result.json"));
|
||||
final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class);
|
||||
|
||||
final EntityRelEntity out = utils.asRelatedEntity(e);
|
||||
System.out.println(out);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOafMappingSoftware() throws IOException {
|
||||
|
||||
final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("software.json"));
|
||||
final EntityRelEntity e = new ObjectMapper().readValue(in, EntityRelEntity.class);
|
||||
|
||||
final EntityRelEntity out = utils.asRelatedEntity(e);
|
||||
System.out.println(out);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testParseRelatedEntity() throws IOException {
|
||||
|
||||
final InputStreamReader in = new InputStreamReader(getClass().getResourceAsStream("related_entity.json"));
|
||||
final RelatedEntity e = new ObjectMapper().readValue(in, RelatedEntity.class);
|
||||
|
||||
System.out.println(e);
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import eu.dnetlib.dhp.graph.utils.ContextMapper;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class XmlRecordFactoryTest {
|
||||
|
||||
private static final Log log = LogFactory.getLog(XmlRecordFactoryTest.class);
|
||||
|
||||
private Path testDir;
|
||||
|
||||
@Before
|
||||
public void setup() throws IOException {
|
||||
testDir = Files.createTempDirectory(getClass().getSimpleName());
|
||||
log.info("created test directory " + testDir.toString());
|
||||
}
|
||||
|
||||
@After
|
||||
public void tearDown() throws IOException {
|
||||
FileUtils.deleteDirectory(testDir.toFile());
|
||||
log.info("deleted test directory " + testDir.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testXmlSerialization() throws Exception {
|
||||
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkXmlRecordBuilderJob.class.getSimpleName())
|
||||
.master("local[*]")
|
||||
.getOrCreate();
|
||||
|
||||
final String inputDir = testDir.toString() + "/3_joined_entities";
|
||||
FileUtils.forceMkdir(new File(inputDir));
|
||||
FileUtils.copyFile(new File("/Users/claudio/Downloads/joined_entities-part-00000"), new File(inputDir + "/joined_entities-part-00000"));
|
||||
|
||||
final ContextMapper ctx = ContextMapper.fromIS("https://dev-openaire.d4science.org:443/is/services/isLookUp");
|
||||
|
||||
final GraphJoiner g = new GraphJoiner(spark, ctx, inputDir, testDir.toString());
|
||||
|
||||
g.asXML();
|
||||
}
|
||||
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,5 @@
|
|||
{
|
||||
"id": "20|nih_________::6b8108b6d6399f7163a6a7ccdd0efc2d",
|
||||
"type": "organization",
|
||||
"legalname": "MCGILL UNIVERSITY"
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -10,9 +10,8 @@ This module is automatically executed when running:
|
|||
on module having set:
|
||||
|
||||
<parent>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dhp-wf</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
</parent>
|
||||
|
||||
in `pom.xml` file. `oozie-package` profile initializes oozie workflow packaging, `workflow.source.dir` property points to a workflow (notice: this is not a relative path but a classpath to directory usually holding `oozie_app` subdirectory).
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
@ -20,6 +20,7 @@
|
|||
<module>dhp-dedup</module>
|
||||
<module>dhp-bulktag</module>
|
||||
<module>dhp-propagation</module>
|
||||
<module>dhp-graph-provision</module>
|
||||
</modules>
|
||||
|
||||
<pluginRepositories>
|
||||
|
|
114
pom.xml
114
pom.xml
|
@ -1,9 +1,11 @@
|
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.0.5-SNAPSHOT</version>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<url>http://www.d-net.research-infrastructures.eu</url>
|
||||
|
@ -96,6 +98,12 @@
|
|||
<version>${dhp.hadoop.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
<version>${dhp.hadoop.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-client</artifactId>
|
||||
|
@ -149,7 +157,7 @@
|
|||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
<version>9.5.1-5</version>
|
||||
<version>9.9.1-6</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
@ -170,6 +178,56 @@
|
|||
<version>1.1.6</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.mycila.xmltool</groupId>
|
||||
<artifactId>xmltool</artifactId>
|
||||
<version>3.3</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.solr</groupId>
|
||||
<artifactId>solr-solrj</artifactId>
|
||||
<version>7.5.0</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>*</artifactId>
|
||||
<groupId>*</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.lucidworks.spark</groupId>
|
||||
<artifactId>spark-solr</artifactId>
|
||||
<version>3.6.0</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>*</artifactId>
|
||||
<groupId>*</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>4.5.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpmime</artifactId>
|
||||
<version>4.5.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.noggit</groupId>
|
||||
<artifactId>noggit</artifactId>
|
||||
<version>0.8</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.zookeeper</groupId>
|
||||
<artifactId>zookeeper</artifactId>
|
||||
<version>3.4.11</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>net.schmizz</groupId>
|
||||
<artifactId>sshj</artifactId>
|
||||
|
@ -200,10 +258,19 @@
|
|||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-pace-core</artifactId>
|
||||
<version>4.0.0-SNAPSHOT</version>
|
||||
<version>4.0.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>cnr-rmi-api</artifactId>
|
||||
<version>[2.0.0,3.0.0)</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.cxf</groupId>
|
||||
<artifactId>cxf-rt-transports-http</artifactId>
|
||||
<version>3.1.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>javax.persistence</groupId>
|
||||
<artifactId>javax.persistence-api</artifactId>
|
||||
|
@ -231,6 +298,16 @@
|
|||
<artifactId>secondstring</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongo-java-driver</artifactId>
|
||||
<version>${mongodb.driver.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.antlr</groupId>
|
||||
<artifactId>stringtemplate</artifactId>
|
||||
<version>4.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.oozie</groupId>
|
||||
|
@ -349,31 +426,7 @@
|
|||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>4.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-test-compile</id>
|
||||
<phase>process-test-resources</phase>
|
||||
<goals>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
</plugins>
|
||||
|
||||
<extensions>
|
||||
|
@ -421,6 +474,7 @@
|
|||
<dhp.jackson.version>2.9.6</dhp.jackson.version>
|
||||
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
|
||||
<scala.version>2.11.12</scala.version>
|
||||
<mongodb.driver.version>3.4.2</mongodb.driver.version>
|
||||
</properties>
|
||||
</project>
|
||||
|
||||
|
|
Loading…
Reference in New Issue