commit a87c070447
conflicts resolved, merged from beta
@@ -0,0 +1,21 @@
+style = defaultWithAlign
+
+align.openParenCallSite = false
+align.openParenDefnSite = false
+align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
+continuationIndent.callSite = 2
+continuationIndent.defnSite = 2
+danglingParentheses = true
+indentOperator = spray
+maxColumn = 120
+newlines.alwaysBeforeTopLevelStatements = true
+project.excludeFilters = [".*\\.sbt"]
+rewrite.rules = [AvoidInfix]
+rewrite.rules = [ExpandImportSelectors]
+rewrite.rules = [RedundantBraces]
+rewrite.rules = [RedundantParens]
+rewrite.rules = [SortImports]
+rewrite.rules = [SortModifiers]
+rewrite.rules = [PreferCurlyFors]
+spaces.inImportCurlyBraces = false
+unindentTopLevelOperators = true
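
Illustration (not part of the commit): under the settings above, scalafmt keeps lines within maxColumn = 120 and aligns the configured tokens; a minimal sketch of the effect on hypothetical code:

    // Hypothetical snippet: align.tokens lines up "->" and case arrows.
    val resultTypes = Map(
      "publication" -> "article",
      "dataset"     -> "data"
    )

    resultTypes.get("dataset") match {
      case Some(v) => v
      case None    => ""
    }
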
@@ -1,72 +0,0 @@
-package eu.dnetlib.dhp.application
-
-import scala.io.Source
-
-/**
- * This is the main Interface SparkApplication
- * where all the Spark Scala class should inherit
- *
- */
-trait SparkScalaApplication {
-  /**
-   * This is the path in the classpath of the json
-   * describes all the argument needed to run
-   */
-  val propertyPath: String
-
-  /**
-   * Utility to parse the arguments using the
-   * property json in the classpath identified from
-   * the variable propertyPath
-   *
-   * @param args the list of arguments
-   */
-  def parseArguments(args: Array[String]): ArgumentApplicationParser = {
-    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString)
-    parser.parseArgument(args)
-    parser
-  }
-
-  /**
-   * Here all the spark applications runs this method
-   * where the whole logic of the spark node is defined
-   */
-  def run(): Unit
-}
-
-
-import org.apache.spark.SparkConf
-import org.apache.spark.sql.SparkSession
-import org.slf4j.Logger
-
-abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
-
-  var parser: ArgumentApplicationParser = null
-
-  var spark:SparkSession = null
-
-
-  def initialize():SparkScalaApplication = {
-    parser = parseArguments(args)
-    spark = createSparkSession()
-    this
-  }
-
-  /**
-   * Utility for creating a spark session starting from parser
-   *
-   * @return a spark Session
-   */
-  private def createSparkSession():SparkSession = {
-    require(parser!= null)
-
-    val conf:SparkConf = new SparkConf()
-    val master = parser.get("master")
-    log.info(s"Creating Spark session: Master: $master")
-    SparkSession.builder().config(conf)
-      .appName(getClass.getSimpleName)
-      .master(master)
-      .getOrCreate()
-  }
-
-}
@@ -1,413 +0,0 @@
-
-package eu.dnetlib.dhp.common;
-
-import java.io.Serializable;
-import java.util.*;
-import java.util.stream.Collectors;
-
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.dump.oaf.*;
-import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
-import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.Field;
-import eu.dnetlib.dhp.schema.oaf.Journal;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-
-public class GraphResultMapper implements Serializable {
-
-    public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
-        E in) {
-
-        CommunityResult out = new CommunityResult();
-
-        eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
-        Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
-        if (ort.isPresent()) {
-            switch (ort.get().getClassid()) {
-                case "publication":
-                    Optional<Journal> journal = Optional
-                        .ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
-                    if (journal.isPresent()) {
-                        Journal j = journal.get();
-                        Container c = new Container();
-                        c.setConferencedate(j.getConferencedate());
-                        c.setConferenceplace(j.getConferenceplace());
-                        c.setEdition(j.getEdition());
-                        c.setEp(j.getEp());
-                        c.setIss(j.getIss());
-                        c.setIssnLinking(j.getIssnLinking());
-                        c.setIssnOnline(j.getIssnOnline());
-                        c.setIssnPrinted(j.getIssnPrinted());
-                        c.setName(j.getName());
-                        c.setSp(j.getSp());
-                        c.setVol(j.getVol());
-                        out.setContainer(c);
-                        out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
-                    }
-                    break;
-                case "dataset":
-                    eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
-                    Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
-                    Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
-
-                    out
-                        .setGeolocation(
-                            Optional
-                                .ofNullable(id.getGeolocation())
-                                .map(
-                                    igl -> igl
-                                        .stream()
-                                        .filter(Objects::nonNull)
-                                        .map(gli -> {
-                                            GeoLocation gl = new GeoLocation();
-                                            gl.setBox(gli.getBox());
-                                            gl.setPlace(gli.getPlace());
-                                            gl.setPoint(gli.getPoint());
-                                            return gl;
-                                        })
-                                        .collect(Collectors.toList()))
-                                .orElse(null));
-
-                    out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
-                    break;
-                case "software":
-
-                    eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
-                    Optional
-                        .ofNullable(is.getCodeRepositoryUrl())
-                        .ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
-                    Optional
-                        .ofNullable(is.getDocumentationUrl())
-                        .ifPresent(
-                            value -> out
-                                .setDocumentationUrl(
-                                    value
-                                        .stream()
-                                        .map(Field::getValue)
-                                        .collect(Collectors.toList())));
-
-                    Optional
-                        .ofNullable(is.getProgrammingLanguage())
-                        .ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
-
-                    out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
-                    break;
-                case "other":
-
-                    eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
-                    out
-                        .setContactgroup(
-                            Optional
-                                .ofNullable(ir.getContactgroup())
-                                .map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
-                                .orElse(null));
-
-                    out
-                        .setContactperson(
-                            Optional
-                                .ofNullable(ir.getContactperson())
-                                .map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
-                                .orElse(null));
-                    out
-                        .setTool(
-                            Optional
-                                .ofNullable(ir.getTool())
-                                .map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
-                                .orElse(null));
-
-                    out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
-
-                    break;
-            }
-
-            Optional
-                .ofNullable(input.getAuthor())
-                .ifPresent(
-                    ats -> out.setAuthor(ats.stream().map(GraphResultMapper::getAuthor).collect(Collectors.toList())));
-
-            // I do not map Access Right UNKNOWN or OTHER
-
-            Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
-            if (oar.isPresent()) {
-                if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
-                    String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
-                    out
-                        .setBestaccessright(
-                            AccessRight
-                                .newInstance(
-                                    code,
-                                    Constants.coarCodeLabelMap.get(code),
-                                    Constants.COAR_ACCESS_RIGHT_SCHEMA));
-                }
-            }
-
-            final List<String> contributorList = new ArrayList<>();
-            Optional
-                .ofNullable(input.getContributor())
-                .ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
-            out.setContributor(contributorList);
-
-            Optional
-                .ofNullable(input.getCountry())
-                .ifPresent(
-                    value -> out
-                        .setCountry(
-                            value
-                                .stream()
-                                .map(
-                                    c -> {
-                                        if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
-                                            return null;
-                                        }
-                                        Country country = new Country();
-                                        country.setCode(c.getClassid());
-                                        country.setLabel(c.getClassname());
-                                        Optional
-                                            .ofNullable(c.getDataInfo())
-                                            .ifPresent(
-                                                provenance -> country
-                                                    .setProvenance(
-                                                        Provenance
-                                                            .newInstance(
-                                                                provenance
-                                                                    .getProvenanceaction()
-                                                                    .getClassname(),
-                                                                c.getDataInfo().getTrust())));
-                                        return country;
-                                    })
-                                .filter(Objects::nonNull)
-                                .collect(Collectors.toList())));
-
-            final List<String> coverageList = new ArrayList<>();
-            Optional
-                .ofNullable(input.getCoverage())
-                .ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
-            out.setCoverage(coverageList);
-
-            out.setDateofcollection(input.getDateofcollection());
-
-            final List<String> descriptionList = new ArrayList<>();
-            Optional
-                .ofNullable(input.getDescription())
-                .ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
-            out.setDescription(descriptionList);
-            Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
-            if (oStr.isPresent()) {
-                out.setEmbargoenddate(oStr.get().getValue());
-            }
-
-            final List<String> formatList = new ArrayList<>();
-            Optional
-                .ofNullable(input.getFormat())
-                .ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
-            out.setFormat(formatList);
-            out.setId(input.getId());
-            out.setOriginalId(input.getOriginalId());
-
-            Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
-                .ofNullable(input.getInstance());
-
-            if (oInst.isPresent()) {
-                out
-                    .setInstance(
-                        oInst.get().stream().map(GraphResultMapper::getInstance).collect(Collectors.toList()));
-
-            }
-
-            Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
-            if (oL.isPresent()) {
-                eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
-                out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
-            }
-            Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
-            if (oLong.isPresent()) {
-                out.setLastupdatetimestamp(oLong.get());
-            }
-            Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
-            if (otitle.isPresent()) {
-                List<StructuredProperty> iTitle = otitle
-                    .get()
-                    .stream()
-                    .filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
-                    .collect(Collectors.toList());
-                if (!iTitle.isEmpty()) {
-                    out.setMaintitle(iTitle.get(0).getValue());
-                }
-
-                iTitle = otitle
-                    .get()
-                    .stream()
-                    .filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
-                    .collect(Collectors.toList());
-                if (!iTitle.isEmpty()) {
-                    out.setSubtitle(iTitle.get(0).getValue());
-                }
-
-            }
-
-            List<ControlledField> pids = new ArrayList<>();
-            Optional
-                .ofNullable(input.getPid())
-                .ifPresent(
-                    value -> value
-                        .stream()
-                        .forEach(
-                            p -> pids
-                                .add(
-                                    ControlledField
-                                        .newInstance(p.getQualifier().getClassid(), p.getValue()))));
-            out.setPid(pids);
-            oStr = Optional.ofNullable(input.getDateofacceptance());
-            if (oStr.isPresent()) {
-                out.setPublicationdate(oStr.get().getValue());
-            }
-            oStr = Optional.ofNullable(input.getPublisher());
-            if (oStr.isPresent()) {
-                out.setPublisher(oStr.get().getValue());
-            }
-
-            List<String> sourceList = new ArrayList<>();
-            Optional
-                .ofNullable(input.getSource())
-                .ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
-            // out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
-            List<Subject> subjectList = new ArrayList<>();
-            Optional
-                .ofNullable(input.getSubject())
-                .ifPresent(
-                    value -> value
-                        .forEach(s -> subjectList.add(getSubject(s))));
-
-            out.setSubjects(subjectList);
-
-            out.setType(input.getResulttype().getClassid());
-        }
-
-        out
-            .setCollectedfrom(
-                input
-                    .getCollectedfrom()
-                    .stream()
-                    .map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
-                    .collect(Collectors.toList()));
-
-        return out;
-
-    }
-
-    private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
-        CommunityInstance instance = new CommunityInstance();
-
-        setCommonValue(i, instance);
-
-        instance
-            .setCollectedfrom(
-                KeyValue
-                    .newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
-
-        instance
-            .setHostedby(
-                KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
-
-        return instance;
-
-    }
-
-    private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
-        Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
-            .ofNullable(i.getAccessright());
-        if (opAr.isPresent()) {
-            if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
-                String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
-                instance
-                    .setAccessright(
-                        AccessRight
-                            .newInstance(
-                                code,
-                                Constants.coarCodeLabelMap.get(code),
-                                Constants.COAR_ACCESS_RIGHT_SCHEMA));
-            }
-        }
-
-        Optional
-            .ofNullable(i.getLicense())
-            .ifPresent(value -> instance.setLicense(value.getValue()));
-        Optional
-            .ofNullable(i.getDateofacceptance())
-            .ifPresent(value -> instance.setPublicationdate(value.getValue()));
-        Optional
-            .ofNullable(i.getRefereed())
-            .ifPresent(value -> instance.setRefereed(value.getClassname()));
-        Optional
-            .ofNullable(i.getInstancetype())
-            .ifPresent(value -> instance.setType(value.getClassname()));
-        Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
-
-    }
-
-    private static Subject getSubject(StructuredProperty s) {
-        Subject subject = new Subject();
-        subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
-        Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
-        if (di.isPresent()) {
-            Provenance p = new Provenance();
-            p.setProvenance(di.get().getProvenanceaction().getClassname());
-            p.setTrust(di.get().getTrust());
-            subject.setProvenance(p);
-        }
-
-        return subject;
-    }
-
-    private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
-        Author a = new Author();
-        a.setFullname(oa.getFullname());
-        a.setName(oa.getName());
-        a.setSurname(oa.getSurname());
-        a.setRank(oa.getRank());
-
-        Optional<List<StructuredProperty>> oPids = Optional
-            .ofNullable(oa.getPid());
-        if (oPids.isPresent()) {
-            Pid pid = getOrcid(oPids.get());
-            if (pid != null) {
-                a.setPid(pid);
-            }
-        }
-
-        return a;
-    }
-
-    private static Pid getOrcid(List<StructuredProperty> p) {
-        for (StructuredProperty pid : p) {
-            if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
-                Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
-                if (di.isPresent()) {
-                    return Pid
-                        .newInstance(
-                            ControlledField
-                                .newInstance(
-                                    pid.getQualifier().getClassid(),
-                                    pid.getValue()),
-                            Provenance
-                                .newInstance(
-                                    di.get().getProvenanceaction().getClassname(),
-                                    di.get().getTrust()));
-                } else {
-                    return Pid
-                        .newInstance(
-                            ControlledField
-                                .newInstance(
-                                    pid.getQualifier().getClassid(),
-                                    pid.getValue())
-
-                        );
-                }
-
-            }
-        }
-        return null;
-    }
-
-}
@@ -0,0 +1,73 @@
+package eu.dnetlib.dhp.application
+
+import scala.io.Source
+
+/** This is the main Interface SparkApplication
+  * where all the Spark Scala class should inherit
+  */
+trait SparkScalaApplication {
+
+  /** This is the path in the classpath of the json
+    * describes all the argument needed to run
+    */
+  val propertyPath: String
+
+  /** Utility to parse the arguments using the
+    * property json in the classpath identified from
+    * the variable propertyPath
+    *
+    * @param args the list of arguments
+    */
+  def parseArguments(args: Array[String]): ArgumentApplicationParser = {
+    val parser = new ArgumentApplicationParser(
+      Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
+    )
+    parser.parseArgument(args)
+    parser
+  }
+
+  /** Here all the spark applications runs this method
+    * where the whole logic of the spark node is defined
+    */
+  def run(): Unit
+}
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.SparkSession
+import org.slf4j.Logger
+
+abstract class AbstractScalaApplication(
+  val propertyPath: String,
+  val args: Array[String],
+  log: Logger
+) extends SparkScalaApplication {
+
+  var parser: ArgumentApplicationParser = null
+
+  var spark: SparkSession = null
+
+  def initialize(): SparkScalaApplication = {
+    parser = parseArguments(args)
+    spark = createSparkSession()
+    this
+  }
+
+  /** Utility for creating a spark session starting from parser
+    *
+    * @return a spark Session
+    */
+  private def createSparkSession(): SparkSession = {
+    require(parser != null)
+
+    val conf: SparkConf = new SparkConf()
+    val master = parser.get("master")
+    log.info(s"Creating Spark session: Master: $master")
+    SparkSession
+      .builder()
+      .config(conf)
+      .appName(getClass.getSimpleName)
+      .master(master)
+      .getOrCreate()
+  }
+
+}
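
Illustration (not part of the diff): a minimal concrete job built on the new trait might look like the sketch below; the class name, property path and "sourcePath" argument are hypothetical, chosen only to show the initialize()/run() lifecycle.

    import org.slf4j.{Logger, LoggerFactory}

    // Hypothetical subclass of AbstractScalaApplication: parseArguments() and
    // createSparkSession() run inside initialize(), then run() holds the job logic.
    class SparkCountRows(propertyPath: String, args: Array[String], log: Logger)
        extends AbstractScalaApplication(propertyPath, args, log) {

      override def run(): Unit = {
        val sourcePath = parser.get("sourcePath") // declared in the property json
        log.info(s"Counting rows under $sourcePath")
        println(spark.read.text(sourcePath).count())
      }
    }

    object SparkCountRows {
      def main(args: Array[String]): Unit =
        new SparkCountRows(
          "/eu/dnetlib/dhp/count_rows_params.json", // hypothetical resource
          args,
          LoggerFactory.getLogger(classOf[SparkCountRows])
        ).initialize().run()
    }
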
@@ -0,0 +1,442 @@
+package eu.dnetlib.dhp.sx.graph.scholix
+
+import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
+import eu.dnetlib.dhp.schema.sx.scholix._
+import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
+import eu.dnetlib.dhp.utils.DHPUtils
+import org.apache.spark.sql.expressions.Aggregator
+import org.apache.spark.sql.{Encoder, Encoders}
+import org.json4s
+import org.json4s.DefaultFormats
+import org.json4s.jackson.JsonMethods.parse
+import scala.collection.JavaConverters._
+import scala.io.Source
+
+object ScholixUtils extends Serializable {
+
+  val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
+
+  val DATE_RELATION_KEY: String = "RelationDate"
+
+  case class RelationVocabulary(original: String, inverse: String) {}
+
+  case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
+
+  val relations: Map[String, RelationVocabulary] = {
+    val input = Source
+      .fromInputStream(
+        getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
+      )
+      .mkString
+    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+
+    lazy val json: json4s.JValue = parse(input)
+
+    json.extract[Map[String, RelationVocabulary]]
+  }
+
+  def extractRelationDate(relation: Relation): String = {
+
+    if (relation.getProperties == null || !relation.getProperties.isEmpty)
+      null
+    else {
+      val date = relation.getProperties.asScala
+        .find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
+        .map(p => p.getValue)
+      if (date.isDefined)
+        date.get
+      else
+        null
+    }
+  }
+
+  def extractRelationDate(summary: ScholixSummary): String = {
+
+    if (summary.getDate == null || summary.getDate.isEmpty)
+      null
+    else {
+      summary.getDate.get(0)
+    }
+  }
+
+  def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
+    new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
+
+  }
+
+  def generateScholixResourceFromResult(r: Result): ScholixResource = {
+    generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
+  }
+
+  val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
+    new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
+      override def zero: RelatedEntities = null
+
+      override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
+        val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
+        val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
+
+        if (b == null)
+          RelatedEntities(a._1, relatedDataset, relatedPublication)
+        else
+          RelatedEntities(
+            a._1,
+            b.relatedDataset + relatedDataset,
+            b.relatedPublication + relatedPublication
+          )
+      }
+
+      override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
+        if (b1 != null && b2 != null)
+          RelatedEntities(
+            b1.id,
+            b1.relatedDataset + b2.relatedDataset,
+            b1.relatedPublication + b2.relatedPublication
+          )
+        else if (b1 != null)
+          b1
+        else
+          b2
+      }
+
+      override def finish(reduction: RelatedEntities): RelatedEntities = reduction
+
+      override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
+
+      override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
+    }
+
+  val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
+    new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
+      override def zero: Scholix = null
+
+      def scholix_complete(s: Scholix): Boolean = {
+        if (s == null || s.getIdentifier == null) {
+          false
+        } else if (s.getSource == null || s.getTarget == null) {
+          false
+        } else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
+          false
+        else
+          true
+      }
+
+      override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
+        if (scholix_complete(b)) b else a._2
+      }
+
+      override def merge(b1: Scholix, b2: Scholix): Scholix = {
+        if (scholix_complete(b1)) b1 else b2
+      }
+
+      override def finish(reduction: Scholix): Scholix = reduction
+
+      override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
+
+      override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
+    }
+
+  def createInverseScholixRelation(scholix: Scholix): Scholix = {
+    val s = new Scholix
+    s.setPublicationDate(scholix.getPublicationDate)
+    s.setPublisher(scholix.getPublisher)
+    s.setLinkprovider(scholix.getLinkprovider)
+    s.setRelationship(inverseRelationShip(scholix.getRelationship))
+    s.setSource(scholix.getTarget)
+    s.setTarget(scholix.getSource)
+    s.setIdentifier(
+      DHPUtils.md5(
+        s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
+      )
+    )
+    s
+
+  }
+
+  def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
+    if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
+      val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
+        new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
+      }(collection.breakOut)
+      l
+    } else List()
+  }
+
+  def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
+    if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
+      val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
+        new ScholixEntityId(
+          d.getDatasourceName,
+          List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
+        )
+      }(collection.breakOut)
+      l
+    } else List()
+  }
+
+  def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
+    if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
+
+      val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c =>
+        new ScholixEntityId(
+          c.getValue,
+          List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava
+        )
+      }.toList
+      l
+    } else List()
+  }
+
+  def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
+    val s = new Scholix
+    s.setPublicationDate(scholix.getPublicationDate)
+    s.setPublisher(scholix.getPublisher)
+    s.setLinkprovider(scholix.getLinkprovider)
+    s.setRelationship(scholix.getRelationship)
+    s.setSource(scholix.getSource)
+    s.setTarget(generateScholixResourceFromSummary(target))
+    s.setIdentifier(
+      DHPUtils.md5(
+        s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
+      )
+    )
+    s
+  }
+
+  def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
+    val s = new Scholix
+    s.setPublicationDate(scholix.getPublicationDate)
+    s.setPublisher(scholix.getPublisher)
+    s.setLinkprovider(scholix.getLinkprovider)
+    s.setRelationship(scholix.getRelationship)
+    s.setSource(scholix.getSource)
+    s.setTarget(target)
+    s.setIdentifier(
+      DHPUtils.md5(
+        s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
+      )
+    )
+    s
+  }
+
+  def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
+    val r = new ScholixResource
+    r.setIdentifier(summaryObject.getLocalIdentifier)
+    r.setDnetIdentifier(summaryObject.getId)
+
+    r.setObjectType(summaryObject.getTypology.toString)
+    r.setObjectSubType(summaryObject.getSubType)
+
+    if (summaryObject.getTitle != null && !summaryObject.getTitle.isEmpty)
+      r.setTitle(summaryObject.getTitle.get(0))
+
+    if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
+      val l: List[ScholixEntityId] =
+        summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
+      if (l.nonEmpty)
+        r.setCreator(l.asJava)
+    }
+
+    if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
+      r.setPublicationDate(summaryObject.getDate.get(0))
+    if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
+      val plist: List[ScholixEntityId] =
+        summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
+
+      if (plist.nonEmpty)
+        r.setPublisher(plist.asJava)
+    }
+
+    if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
+
+      val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
+        .map(c =>
+          new ScholixCollectedFrom(
+            new ScholixEntityId(
+              c.getDatasourceName,
+              List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
+            ),
+            "collected",
+            "complete"
+          )
+        )
+        .toList
+
+      if (l.nonEmpty)
+        r.setCollectedFrom(l.asJava)
+
+    }
+    r
+  }
+
+  def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
+    if (relation == null || source == null)
+      return null
+    val s = new Scholix
+    var l: List[ScholixEntityId] = extractCollectedFrom(relation)
+    if (l.isEmpty)
+      l = extractCollectedFrom(source)
+    if (l.isEmpty)
+      return null
+    s.setLinkprovider(l.asJava)
+    var d = extractRelationDate(relation)
+    if (d == null)
+      d = source.getPublicationDate
+
+    s.setPublicationDate(d)
+
+    if (source.getPublisher != null && !source.getPublisher.isEmpty) {
+      s.setPublisher(source.getPublisher)
+    }
+
+    val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
+    if (semanticRelation == null)
+      return null
+    s.setRelationship(
+      new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
+    )
+    s.setSource(source)
+
+    s
+  }
+
+  def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
+
+    if (relation == null || source == null)
+      return null
+
+    val s = new Scholix
+
+    var l: List[ScholixEntityId] = extractCollectedFrom(relation)
+    if (l.isEmpty)
+      l = extractCollectedFrom(source)
+    if (l.isEmpty)
+      return null
+
+    s.setLinkprovider(l.asJava)
+
+    var d = extractRelationDate(relation)
+    if (d == null)
+      d = extractRelationDate(source)
+
+    s.setPublicationDate(d)
+
+    if (source.getPublisher != null && !source.getPublisher.isEmpty) {
+      val l: List[ScholixEntityId] = source.getPublisher.asScala
+        .map { p =>
+          new ScholixEntityId(p, null)
+        }(collection.breakOut)
+
+      if (l.nonEmpty)
+        s.setPublisher(l.asJava)
+    }
+
+    val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
+    if (semanticRelation == null)
+      return null
+    s.setRelationship(
+      new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
+    )
+    s.setSource(generateScholixResourceFromSummary(source))
+
+    s
+  }
+
+  def findURLForPID(
+    pidValue: List[StructuredProperty],
+    urls: List[String]
+  ): List[(StructuredProperty, String)] = {
+    pidValue.map { p =>
+      val pv = p.getValue
+
+      val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
+      (p, r.orNull)
+    }
+  }
+
+  def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
+    if (r.getInstance() == null || r.getInstance().isEmpty)
+      return List()
+    r.getInstance()
+      .asScala
+      .filter(i => i.getUrl != null && !i.getUrl.isEmpty)
+      .filter(i => i.getPid != null && i.getUrl != null)
+      .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
+      .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
+      .distinct
+      .toList
+  }
+
+  def resultToSummary(r: Result): ScholixSummary = {
+    val s = new ScholixSummary
+    s.setId(r.getId)
+    if (r.getPid == null || r.getPid.isEmpty)
+      return null
+
+    val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r)
+    if (persistentIdentifiers.isEmpty)
+      return null
+    s.setLocalIdentifier(persistentIdentifiers.asJava)
+    if (r.isInstanceOf[Publication])
+      s.setTypology(Typology.publication)
+    else
+      s.setTypology(Typology.dataset)
+
+    s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
+
+    if (r.getTitle != null && r.getTitle.asScala.nonEmpty) {
+      val titles: List[String] = r.getTitle.asScala.map(t => t.getValue).toList
+      if (titles.nonEmpty)
+        s.setTitle(titles.asJava)
+      else
+        return null
+    }
+
+    if (r.getAuthor != null && !r.getAuthor.isEmpty) {
+      val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname).toList
+      if (authors.nonEmpty)
+        s.setAuthor(authors.asJava)
+    }
+    if (r.getInstance() != null) {
+      val dt: List[String] = r
+        .getInstance()
+        .asScala
+        .filter(i => i.getDateofacceptance != null)
+        .map(i => i.getDateofacceptance.getValue)
+        .toList
+      if (dt.nonEmpty)
+        s.setDate(dt.distinct.asJava)
+    }
+    if (r.getDescription != null && !r.getDescription.isEmpty) {
+      val d = r.getDescription.asScala.find(f => f != null && f.getValue != null)
+      if (d.isDefined)
+        s.setDescription(d.get.getValue)
+    }
+
+    if (r.getSubject != null && !r.getSubject.isEmpty) {
+      val subjects: List[SchemeValue] = r.getSubject.asScala
+        .map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
+        .toList
+      if (subjects.nonEmpty)
+        s.setSubject(subjects.asJava)
+    }
+
+    if (r.getPublisher != null)
+      s.setPublisher(List(r.getPublisher.getValue).asJava)
+
+    if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
+      val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
+        .map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
+        .toList
+      if (cf.nonEmpty)
+        s.setDatasources(cf.distinct.asJava)
+    }
+
+    s.setRelatedDatasets(0)
+    s.setRelatedPublications(0)
+    s.setRelatedUnknown(0)
+
+    s
+  }
+
+}
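
Illustration (not part of the diff): a sketch of how the helpers above chain together to build a Scholix link between two already-loaded graph Results. The function name is hypothetical, and the Option wrapping reflects that several of these helpers return null on incomplete input.

    import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
    import eu.dnetlib.dhp.schema.sx.scholix.Scholix

    // Hypothetical wiring: source Result -> ScholixResource, then a partial link
    // from the Relation, completed with a summary of the target Result.
    def toScholix(source: Result, target: Result, rel: Relation): Option[Scholix] =
      for {
        sourceResource <- Option(ScholixUtils.generateScholixResourceFromResult(source))
        partial        <- Option(ScholixUtils.scholixFromSource(rel, sourceResource))
        targetSummary  <- Option(ScholixUtils.resultToSummary(target))
      } yield ScholixUtils.generateCompleteScholix(partial, targetSummary)
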
@@ -1 +1,140 @@
-{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}
+{
+  "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
+  "resuttype": {"classid": "dataset"},
+  "pid": [
+    {
+      "qualifier": {"classid": "doi"},
+      "value": "10.1016/j.cmet.2011.03.013"
+    },
+    {
+      "qualifier": {"classid": "urn"},
+      "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
+    },
+    {
+      "qualifier": {"classid": "scp-number"},
+      "value": "79953761260"
+    },
+    {
+      "qualifier": {"classid": "pmc"},
+      "value": "21459329"
+    }
+  ],
+  "collectedfrom": [
+    {
+      "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
+      "value": "Repository B"
+    }
+  ],
+  "instance": [
+    {
+      "refereed": {
+        "classid": "0000",
+        "classname": "UNKNOWN",
+        "schemeid": "dnet:review_levels",
+        "schemename": "dnet:review_levels"
+      },
+      "hostedby": {
+        "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
+        "value": "Zenodo"
+      },
+      "accessright": {
+        "classid": "OPEN",
+        "classname": "Open Access",
+        "schemeid": "dnet:access_modes",
+        "schemename": "dnet:access_modes"
+      },
+      "processingchargecurrency": {
+        "dataInfo": {
+          "provenanceaction": {
+            "classid": "sysimport:crosswalk:datasetarchive",
+            "classname": "Harvested",
+            "schemeid": "dnet:provenanceActions",
+            "schemename": "dnet:provenanceActions"
+          },
+          "deletedbyinference": false,
+          "inferred": false,
+          "inferenceprovenance": "",
+          "invisible": true,
+          "trust": "0.9"
+        },
+        "value": "EUR"
+      },
+      "pid": [
+        {
+          "dataInfo": {
+            "provenanceaction": {
+              "classid": "sysimport:crosswalk:datasetarchive",
+              "classname": "Harvested",
+              "schemeid": "dnet:provenanceActions",
+              "schemename": "dnet:provenanceActions"
+            },
+            "deletedbyinference": false,
+            "inferred": false,
+            "inferenceprovenance": "",
+            "invisible": true,
+            "trust": "0.9"
+          },
+          "qualifier": {
+            "classid": "doi",
+            "classname": "Digital Object Identifier",
+            "schemeid": "dnet:pid_types",
+            "schemename": "dnet:pid_types"
+          },
+          "value": "10.1371/journal.pone.0085605"
+        }
+      ],
+      "distributionlocation": "",
+      "url": ["https://doi.org/10.1371/journal.pone.0085605"],
+      "alternateIdentifier": [
+        {
+          "dataInfo": {
+            "provenanceaction": {
+              "classid": "sysimport:crosswalk:datasetarchive",
+              "classname": "Harvested",
+              "schemeid": "dnet:provenanceActions",
+              "schemename": "dnet:provenanceActions"
+            },
+            "deletedbyinference": false,
+            "inferred": false,
+            "inferenceprovenance": "",
+            "invisible": true,
+            "trust": "0.9"
+          },
+          "qualifier": {
+            "classid": "pmid",
+            "classname": "PubMed ID",
+            "schemeid": "dnet:pid_types",
+            "schemename": "dnet:pid_types"
+          },
+          "value": "24454899.0"
+        }
+      ],
+      "collectedfrom": {
+        "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
+        "value": "Repository B"
+      },
+      "processingchargeamount": {
+        "dataInfo": {
+          "provenanceaction": {
+            "classid": "sysimport:crosswalk:datasetarchive",
+            "classname": "Harvested",
+            "schemeid": "dnet:provenanceActions",
+            "schemename": "dnet:provenanceActions"
+          },
+          "deletedbyinference": false,
+          "inferred": false,
+          "inferenceprovenance": "",
+          "invisible": true,
+          "trust": "0.9"
+        },
+        "value": "1022.02"
+      },
+      "instancetype": {
+        "classid": "0004",
+        "classname": "Conference object",
+        "schemeid": "dnet:publication_resource",
+        "schemename": "dnet:publication_resource"
+      }
+    }
+  ]
+}
@@ -0,0 +1,140 @@
+{
+  "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
+  "resuttype": {"classid": "dataset"},
+  "pid": [
+    {
+      "qualifier": {"classid": "doi"},
+      "value": "10.1016/j.cmet.2011.03.013"
+    },
+    {
+      "qualifier": {"classid": "urn"},
+      "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
+    },
+    {
+      "qualifier": {"classid": "scp-number"},
+      "value": "79953761260"
+    },
+    {
+      "qualifier": {"classid": "pmc"},
+      "value": "21459329"
+    }
+  ],
+  "collectedfrom": [
+    {
+      "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
+      "value": "Zenodo"
+    }
+  ],
+  "instance": [
+    {
+      "refereed": {
+        "classid": "0000",
+        "classname": "UNKNOWN",
+        "schemeid": "dnet:review_levels",
+        "schemename": "dnet:review_levels"
+      },
+      "hostedby": {
+        "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
+        "value": "Zenodo"
+      },
+      "accessright": {
+        "classid": "OPEN",
+        "classname": "Open Access",
+        "schemeid": "dnet:access_modes",
+        "schemename": "dnet:access_modes"
+      },
+      "processingchargecurrency": {
+        "dataInfo": {
+          "provenanceaction": {
+            "classid": "sysimport:crosswalk:datasetarchive",
+            "classname": "Harvested",
+            "schemeid": "dnet:provenanceActions",
+            "schemename": "dnet:provenanceActions"
+          },
+          "deletedbyinference": false,
+          "inferred": false,
+          "inferenceprovenance": "",
+          "invisible": true,
+          "trust": "0.9"
+        },
+        "value": "EUR"
+      },
+      "pid": [
+        {
+          "dataInfo": {
+            "provenanceaction": {
+              "classid": "sysimport:crosswalk:datasetarchive",
+              "classname": "Harvested",
+              "schemeid": "dnet:provenanceActions",
+              "schemename": "dnet:provenanceActions"
+            },
+            "deletedbyinference": false,
+            "inferred": false,
+            "inferenceprovenance": "",
+            "invisible": true,
+            "trust": "0.9"
+          },
+          "qualifier": {
+            "classid": "doi",
+            "classname": "Digital Object Identifier",
+            "schemeid": "dnet:pid_types",
+            "schemename": "dnet:pid_types"
+          },
+          "value": "10.1371/journal.pone.0085605"
+        }
+      ],
+      "distributionlocation": "",
+      "url": ["https://doi.org/10.1371/journal.pone.0085605"],
+      "alternateIdentifier": [
+        {
+          "dataInfo": {
+            "provenanceaction": {
+              "classid": "sysimport:crosswalk:datasetarchive",
+              "classname": "Harvested",
+              "schemeid": "dnet:provenanceActions",
+              "schemename": "dnet:provenanceActions"
+            },
+            "deletedbyinference": false,
+            "inferred": false,
+            "inferenceprovenance": "",
+            "invisible": true,
+            "trust": "0.9"
+          },
+          "qualifier": {
+            "classid": "pmid",
+            "classname": "PubMed ID",
+            "schemeid": "dnet:pid_types",
+            "schemename": "dnet:pid_types"
+          },
+          "value": "24454899.0"
+        }
+      ],
+      "collectedfrom": {
+        "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
+        "value": "Zenodo"
+      },
+      "processingchargeamount": {
+        "dataInfo": {
+          "provenanceaction": {
+            "classid": "sysimport:crosswalk:datasetarchive",
+            "classname": "Harvested",
+            "schemeid": "dnet:provenanceActions",
+            "schemename": "dnet:provenanceActions"
+          },
+          "deletedbyinference": false,
+          "inferred": false,
+          "inferenceprovenance": "",
+          "invisible": true,
+          "trust": "0.9"
+        },
+        "value": "1022.02"
+      },
+      "instancetype": {
+        "classid": "0004",
+        "classname": "Conference object",
+        "schemeid": "dnet:publication_resource",
+        "schemename": "dnet:publication_resource"
+      }
+    }
+  ]
+}
@@ -1,86 +0,0 @@
-
-package eu.dnetlib.dhp.actionmanager.bipfinder;
-
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
-import java.io.Serializable;
-import java.util.Objects;
-import java.util.Optional;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.SequenceFileOutputFormat;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.sql.SparkSession;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.schema.oaf.Result;
-
-/**
- * Just collects all the atomic actions produced for the different results and saves them in
- * outputpath for the ActionSet
- */
-public class CollectAndSave implements Serializable {
-
-    private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
-
-    public static void main(String[] args) throws Exception {
-
-        String jsonConfiguration = IOUtils
-            .toString(
-                Objects
-                    .requireNonNull(
-                        CollectAndSave.class
-                            .getResourceAsStream(
-                                "/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json")));
-
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
-
-        parser.parseArgument(args);
-
-        Boolean isSparkSessionManaged = Optional
-            .ofNullable(parser.get("isSparkSessionManaged"))
-            .map(Boolean::valueOf)
-            .orElse(Boolean.TRUE);
-
-        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
-
-        final String inputPath = parser.get("inputPath");
-        log.info("inputPath {}: ", inputPath);
-
-        final String outputPath = parser.get("outputPath");
-        log.info("outputPath {}: ", outputPath);
-
-        SparkConf conf = new SparkConf();
-
-        runWithSparkSession(
-            conf,
-            isSparkSessionManaged,
-            spark -> {
-                removeOutputDir(spark, outputPath);
-                collectAndSave(spark, inputPath, outputPath);
-            });
-    }
-
-    private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
-        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
-
-        sc
-            .sequenceFile(inputPath + "/publication", Text.class, Text.class)
-            .union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class))
-            .union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
-            .union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
-            .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
-    }
-
-    private static void removeOutputDir(SparkSession spark, String path) {
-        HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
-    }
-
-}
@@ -1,28 +0,0 @@
-
-package eu.dnetlib.dhp.actionmanager.bipfinder;
-
-import java.io.Serializable;
-
-/**
- * Subset of the information of the generic results that are needed to create the atomic action
- */
-public class PreparedResult implements Serializable {
-    private String id; // openaire id
-    private String value; // doi
-
-    public String getId() {
-        return id;
-    }
-
-    public void setId(String id) {
-        this.id = id;
-    }
-
-    public String getValue() {
-        return value;
-    }
-
-    public void setValue(String value) {
-        this.value = value;
-    }
-}
@@ -1,5 +1,5 @@
 
-package eu.dnetlib.dhp.actionmanager.bipfinder;
+package eu.dnetlib.dhp.actionmanager.bipmodel;
 
 import java.io.Serializable;
 import java.util.ArrayList;
@@ -1,5 +1,5 @@
 
-package eu.dnetlib.dhp.actionmanager.bipfinder;
+package eu.dnetlib.dhp.actionmanager.bipmodel;
 
 import java.io.Serializable;
 import java.util.List;
@@ -1,5 +1,5 @@
 
-package eu.dnetlib.dhp.actionmanager.bipfinder;
+package eu.dnetlib.dhp.actionmanager.bipmodel;
 
 import java.io.Serializable;
 
@@ -1,5 +1,5 @@
 
-package eu.dnetlib.dhp.actionmanager.bipfinder;
+package eu.dnetlib.dhp.actionmanager.bipmodel;
 
 import java.io.Serializable;
 import java.util.List;
@@ -1,77 +0,0 @@
-
-package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
-
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Serializable;
-import java.util.Objects;
-import java.util.Optional;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.collection.GetCSV;
-
-public class GetFOSData implements Serializable {
-
-    private static final Logger log = LoggerFactory.getLogger(GetFOSData.class);
-
-    public static final char DEFAULT_DELIMITER = '\t';
-
-    public static void main(final String[] args) throws Exception {
-        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
-            IOUtils
-                .toString(
-                    Objects
-                        .requireNonNull(
-                            GetFOSData.class
                                .getResourceAsStream(
-                                    "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json"))));
-
-        parser.parseArgument(args);
-
-        // the path where the original fos csv file is stored
-        final String sourcePath = parser.get("sourcePath");
-        log.info("sourcePath {}", sourcePath);
-
-        // the path where to put the file as json
-        final String outputPath = parser.get("outputPath");
-        log.info("outputPath {}", outputPath);
-
-        final String hdfsNameNode = parser.get("hdfsNameNode");
-        log.info("hdfsNameNode {}", hdfsNameNode);
-
-        final String classForName = parser.get("classForName");
-        log.info("classForName {}", classForName);
-
-        final char delimiter = Optional
-            .ofNullable(parser.get("delimiter"))
-            .map(s -> s.charAt(0))
-            .orElse(DEFAULT_DELIMITER);
-        log.info("delimiter {}", delimiter);
-
-        Configuration conf = new Configuration();
-        conf.set("fs.defaultFS", hdfsNameNode);
-
-        FileSystem fileSystem = FileSystem.get(conf);
-
-        new GetFOSData().doRewrite(sourcePath, outputPath, classForName, delimiter, fileSystem);
-
-    }
-
-    public void doRewrite(String inputPath, String outputFile, String classForName, char delimiter, FileSystem fs)
-        throws IOException, ClassNotFoundException {
-
-        // reads the csv and writes it as its json equivalent
-        try (InputStreamReader reader = new InputStreamReader(fs.open(new Path(inputPath)))) {
-            GetCSV.getCsv(fs, reader, outputFile, classForName, delimiter);
-        }
-
-    }
-
-}
@@ -0,0 +1,91 @@
+
+package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
+
+import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
+import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+import java.io.Serializable;
+import java.util.Optional;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+public class GetFOSSparkJob implements Serializable {
+
+    private static final Logger log = LoggerFactory.getLogger(GetFOSSparkJob.class);
+
+    public static void main(String[] args) throws Exception {
+
+        String jsonConfiguration = IOUtils
+            .toString(
+                GetFOSSparkJob.class
+                    .getResourceAsStream(
+                        "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json"));
+
+        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
+
+        parser.parseArgument(args);
+
+        Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
+        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+
+        // the path where the original fos csv file is stored
+        final String sourcePath = parser.get("sourcePath");
+        log.info("sourcePath {}", sourcePath);
+
+        // the path where to put the file as json
+        final String outputPath = parser.get("outputPath");
+        log.info("outputPath {}", outputPath);
+
+        final String delimiter = Optional
+            .ofNullable(parser.get("delimiter"))
+            .orElse(DEFAULT_DELIMITER);
+
+        SparkConf sconf = new SparkConf();
+        runWithSparkSession(
+            sconf,
+            isSparkSessionManaged,
+            spark -> {
+                getFOS(
+                    spark,
+                    sourcePath,
+                    outputPath,
+                    delimiter);
+            });
+    }
+
+    private static void getFOS(SparkSession spark, String sourcePath, String outputPath, String delimiter) {
+        Dataset<Row> fosData = spark
+            .read()
+            .format("csv")
+            .option("sep", delimiter)
+            .option("inferSchema", "true")
+            .option("header", "true")
+            .option("quotes", "\"")
+            .load(sourcePath);
+
+        fosData.map((MapFunction<Row, FOSDataModel>) r -> {
+            FOSDataModel fosDataModel = new FOSDataModel();
+            fosDataModel.setDoi(r.getString(0).toLowerCase());
+            fosDataModel.setLevel1(r.getString(1));
+            fosDataModel.setLevel2(r.getString(2));
+            fosDataModel.setLevel3(r.getString(3));
+            return fosDataModel;
+        }, Encoders.bean(FOSDataModel.class))
+            .write()
+            .mode(SaveMode.Overwrite)
+            .json(outputPath);
+
+    }
+
+}
@ -0,0 +1,91 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;

import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class GetSDGSparkJob implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(GetSDGSparkJob.class);

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				GetSDGSparkJob.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		// the path where the original sdg csv file is stored
		final String sourcePath = parser.get("sourcePath");
		log.info("sourcePath {}", sourcePath);

		// the path where to put the file as json
		final String outputPath = parser.get("outputPath");
		log.info("outputPath {}", outputPath);

		final String delimiter = Optional
			.ofNullable(parser.get("delimiter"))
			.orElse(DEFAULT_DELIMITER);

		SparkConf sconf = new SparkConf();
		runWithSparkSession(
			sconf,
			isSparkSessionManaged,
			spark -> {
				getSDG(
					spark,
					sourcePath,
					outputPath,
					delimiter);
			});
	}

	private static void getSDG(SparkSession spark, String sourcePath, String outputPath, String delimiter) {
		Dataset<Row> sdgData = spark
			.read()
			.format("csv")
			.option("sep", delimiter)
			.option("inferSchema", "true")
			.option("header", "true")
			.option("quote", "\"")
			.load(sourcePath);

		sdgData.map((MapFunction<Row, SDGDataModel>) r -> {
			SDGDataModel sdgDataModel = new SDGDataModel();
			sdgDataModel.setDoi(r.getString(0).toLowerCase());
			sdgDataModel.setSbj(r.getString(1));

			return sdgDataModel;
		}, Encoders.bean(SDGDataModel.class))
			.filter((FilterFunction<SDGDataModel>) sdg -> sdg.getSbj() != null)
			.write()
			.mode(SaveMode.Overwrite)
			.json(outputPath);

	}

}
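A rough Scala equivalent of the CSV-to-bean step above, handy for smoke-testing the delimiter and header handling locally; the column positions (0 = doi, 1 = sdg subject) mirror the Java job, while the path, delimiter value, and session setup are illustrative assumptions:

import org.apache.spark.sql.SparkSession

object SdgCsvSmokeTest {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SdgCsvSmokeTest").master("local[*]").getOrCreate()
    import spark.implicits._
    val rows = spark.read
      .format("csv")
      .option("sep", ",") // the job takes this from --delimiter
      .option("header", "true")
      .option("quote", "\"")
      .load("/tmp/sdg.csv")
    // same projection as getSDG: lower-cased doi plus the subject column
    val sdg = rows.map(r => (r.getString(0).toLowerCase, r.getString(1)))
      .toDF("doi", "sbj")
      .filter($"sbj".isNotNull)
    sdg.show(5, truncate = false)
    spark.stop()
  }
}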
@ -0,0 +1,104 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;

import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;

public class PrepareSDGSparkJob implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(PrepareSDGSparkJob.class);

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareSDGSparkJob.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		String sourcePath = parser.get("sourcePath");
		log.info("sourcePath: {}", sourcePath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				doPrepare(
					spark,
					sourcePath,
					outputPath);
			});
	}

	private static void doPrepare(SparkSession spark, String sourcePath, String outputPath) {
		Dataset<SDGDataModel> sdgDataset = readPath(spark, sourcePath, SDGDataModel.class);

		sdgDataset
			.groupByKey((MapFunction<SDGDataModel, String>) r -> r.getDoi().toLowerCase(), Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, SDGDataModel, Result>) (k, it) -> {
				Result r = new Result();
				r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
				SDGDataModel first = it.next();
				List<StructuredProperty> sbjs = new ArrayList<>();
				sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
				it
					.forEachRemaining(
						s -> sbjs
							.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
				r.setSubject(sbjs);
				r
					.setDataInfo(
						OafMapperUtils
							.dataInfo(
								false, null, true,
								false,
								OafMapperUtils
									.qualifier(
										ModelConstants.PROVENANCE_ENRICH,
										null,
										ModelConstants.DNET_PROVENANCE_ACTIONS,
										ModelConstants.DNET_PROVENANCE_ACTIONS),
								null));
				return r;
			}, Encoders.bean(Result.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "/sdg");
	}

}
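The grouping step above collapses all SDG rows that share a DOI into a single unresolved Result carrying every subject seen. A minimal Scala sketch of the same groupByKey/mapGroups pattern, with plain tuples standing in for SDGDataModel and Result (all values here are made up):

import org.apache.spark.sql.SparkSession

object GroupByDoiSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("GroupByDoiSketch").master("local[*]").getOrCreate()
    import spark.implicits._
    val sdg = Seq(("10.1/a", "SDG 3"), ("10.1/A", "SDG 13"), ("10.2/b", "SDG 7")).toDS()
    // one output record per DOI, carrying every subject seen for it
    val grouped = sdg
      .groupByKey(_._1.toLowerCase)
      .mapGroups((doi, it) => (s"unresolved::$doi::doi", it.map(_._2).toList))
    grouped.collect().foreach(println)
    spark.stop()
  }
}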
@ -1,28 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * Class that maps the model of the bipFinder! input data.
 * Only needed for deserialization purposes
 */

public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {

	public BipDeserialize() {
		super();
	}

	public List<Score> get(String key) {

		if (super.get(key) == null) {
			return new ArrayList<>();
		}
		return super.get(key);
	}

}
@ -1,30 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;

import java.io.Serializable;
import java.util.List;

/**
 * Rewriting of the bipFinder input data by extracting the identifier of the result (doi)
 */

public class BipScore implements Serializable {
	private String id; // doi
	private List<Score> scoreList; // unit as given in the inputfile

	public String getId() {
		return id;
	}

	public void setId(String id) {
		this.id = id;
	}

	public List<Score> getScoreList() {
		return scoreList;
	}

	public void setScoreList(List<Score> scoreList) {
		this.scoreList = scoreList;
	}
}
@ -1,26 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;

import java.io.Serializable;

public class KeyValue implements Serializable {

	private String key;
	private String value;

	public String getKey() {
		return key;
	}

	public void setKey(String key) {
		this.key = key;
	}

	public String getValue() {
		return value;
	}

	public void setValue(String value) {
		this.value = value;
	}
}
@ -0,0 +1,47 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;

import java.io.Serializable;

import com.opencsv.bean.CsvBindByPosition;

public class SDGDataModel implements Serializable {

	@CsvBindByPosition(position = 0)
	// @CsvBindByName(column = "doi")
	private String doi;

	@CsvBindByPosition(position = 1)
	// @CsvBindByName(column = "sdg")
	private String sbj;

	public SDGDataModel() {
	}

	public SDGDataModel(String doi, String sbj) {
		this.doi = doi;
		this.sbj = sbj;
	}

	public static SDGDataModel newInstance(String d, String sbj) {
		return new SDGDataModel(d, sbj);
	}

	public String getDoi() {
		return doi;
	}

	public void setDoi(String doi) {
		this.doi = doi;
	}

	public String getSbj() {
		return sbj;
	}

	public void setSbj(String sbj) {
		this.sbj = sbj;
	}
}
@ -1,30 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;

import java.io.Serializable;
import java.util.List;

/**
 * represents the score in the input file
 */
public class Score implements Serializable {

	private String id;
	private List<KeyValue> unit;

	public String getId() {
		return id;
	}

	public void setId(String id) {
		this.id = id;
	}

	public List<KeyValue> getUnit() {
		return unit;
	}

	public void setUnit(List<KeyValue> unit) {
		this.unit = unit;
	}
}
@ -0,0 +1,103 @@
package eu.dnetlib.dhp.actionmanager.opencitations;

import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.IOException;
import java.io.Serializable;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class ReadCOCI implements Serializable {

	private static final Logger log = LoggerFactory.getLogger(ReadCOCI.class);

	public static void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				ReadCOCI.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String[] inputFile = parser.get("inputFile").split(";");
		log.info("inputFile {}", String.join(",", inputFile));
		Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String workingPath = parser.get("workingPath");
		log.info("workingPath {}", workingPath);

		SparkConf sconf = new SparkConf();

		final String delimiter = Optional
			.ofNullable(parser.get("delimiter"))
			.orElse(DEFAULT_DELIMITER);

		runWithSparkSession(
			sconf,
			isSparkSessionManaged,
			spark -> {
				doRead(
					spark,
					workingPath,
					inputFile,
					outputPath,
					delimiter);
			});
	}

	private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
		String outputPath,
		String delimiter) throws IOException {

		for (String inputFile : inputFiles) {
			String pString = workingPath + "/" + inputFile + ".gz";

			Dataset<Row> cociData = spark
				.read()
				.format("csv")
				.option("sep", delimiter)
				.option("inferSchema", "true")
				.option("header", "true")
				.option("quote", "\"")
				.load(pString)
				.repartition(100);

			cociData.map((MapFunction<Row, COCI>) row -> {
				COCI coci = new COCI();
				coci.setOci(row.getString(0));
				coci.setCiting(row.getString(1));
				coci.setCited(row.getString(2));
				return coci;
			}, Encoders.bean(COCI.class))
				.write()
				.mode(SaveMode.Overwrite)
				.option("compression", "gzip")
				.json(outputPath + inputFile);
		}

	}

}
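A small sketch of how the per-file JSON output written above could be read back for inspection; COCI is the bean from the next hunk, while the path (one directory per input file name) is a placeholder assumption:

import org.apache.spark.sql.{Encoders, SparkSession}
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI

object InspectCOCI {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("InspectCOCI").master("local[*]").getOrCreate()
    // hypothetical output of ReadCOCI for one input file
    val coci = spark.read.json("/tmp/coci/2020-01").as(Encoders.bean(classOf[COCI]))
    println(s"citation links: ${coci.count()}")
    spark.stop()
  }
}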
@ -0,0 +1,39 @@
package eu.dnetlib.dhp.actionmanager.opencitations.model;

import java.io.Serializable;

import com.opencsv.bean.CsvBindByPosition;

public class COCI implements Serializable {
	private String oci;

	private String citing;

	private String cited;

	public String getOci() {
		return oci;
	}

	public void setOci(String oci) {
		this.oci = oci;
	}

	public String getCiting() {
		return citing;
	}

	public void setCiting(String citing) {
		this.citing = citing;
	}

	public String getCited() {
		return cited;
	}

	public void setCited(String cited) {
		this.cited = cited;
	}

}
@ -1,69 +0,0 @@
package eu.dnetlib.dhp.actionmanager.scholix

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}

import scala.io.Source

object SparkCreateActionset {

  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(
      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString
    )
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    val sourcePath = parser.get("sourcePath")
    log.info(s"sourcePath -> $sourcePath")

    val targetPath = parser.get("targetPath")
    log.info(s"targetPath -> $targetPath")

    val workingDirFolder = parser.get("workingDirFolder")
    log.info(s"workingDirFolder -> $workingDirFolder")

    implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result]
    implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation]

    import spark.implicits._

    val relation = spark.read.load(s"$sourcePath/relation").as[Relation]

    log.info("extract source and target Identifier involved in relations")

    relation
      .filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
      .flatMap(r => List(r.getSource, r.getTarget))
      .distinct()
      .write.mode(SaveMode.Overwrite)
      .save(s"$workingDirFolder/id_relation")

    val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]

    log.info("save relation filtered")

    relation
      .filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
      .write.mode(SaveMode.Overwrite)
      .save(s"$workingDirFolder/actionSetOaf")

    log.info("saving entities")

    val entities: Dataset[(String, Result)] =
      spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))

    entities
      .joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
      .map(p => p._1._2)
      .write.mode(SaveMode.Append)
      .save(s"$workingDirFolder/actionSetOaf")
  }

}
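The job above materialises the set of identifiers touched by relations and then joins it back against the entities. The same select-by-join pattern in miniature, with strings in place of the kryo-encoded Oaf entities (all names and values illustrative):

import org.apache.spark.sql.SparkSession

object JoinSelectSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("JoinSelectSketch").master("local[*]").getOrCreate()
    import spark.implicits._
    val entities = Seq(("id1", "payload1"), ("id2", "payload2"), ("id3", "payload3")).toDS()
    val idsInRelations = Seq("id1", "id3").toDS()
    // keep only the entities whose id appears in some relation
    val selected = entities
      .joinWith(idsInRelations, entities("_1").equalTo(idsInRelations("value")))
      .map(_._1._2)
    selected.show()
    spark.stop()
  }
}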
@ -1,86 +0,0 @@
package eu.dnetlib.dhp.actionmanager.scholix

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset, Publication, Software, OtherResearchProduct, Relation}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

import scala.io.Source

object SparkSaveActionSet {

  def toActionSet(item: Oaf): (String, String) = {
    val mapper = new ObjectMapper()

    item match {
      case dataset: OafDataset =>
        val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
        a.setClazz(classOf[OafDataset])
        a.setPayload(dataset)
        (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case publication: Publication =>
        val a: AtomicAction[Publication] = new AtomicAction[Publication]
        a.setClazz(classOf[Publication])
        a.setPayload(publication)
        (publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case software: Software =>
        val a: AtomicAction[Software] = new AtomicAction[Software]
        a.setClazz(classOf[Software])
        a.setPayload(software)
        (software.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case orp: OtherResearchProduct =>
        val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
        a.setClazz(classOf[OtherResearchProduct])
        a.setPayload(orp)
        (orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case relation: Relation =>
        val a: AtomicAction[Relation] = new AtomicAction[Relation]
        a.setClazz(classOf[Relation])
        a.setPayload(relation)
        (relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case _ =>
        null
    }
  }

  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(
      Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/save_actionset.json")).mkString
    )
    parser.parseArgument(args)

    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    val sourcePath = parser.get("sourcePath")
    log.info(s"sourcePath -> $sourcePath")

    val targetPath = parser.get("targetPath")
    log.info(s"targetPath -> $targetPath")

    implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)

    spark.read.load(sourcePath).as[Oaf]
      .map(o => toActionSet(o))
      .filter(o => o != null)
      .rdd.map(s => (new Text(s._1), new Text(s._2)))
      .saveAsHadoopFile(targetPath, classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
  }

}
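For context, the (class name, serialized AtomicAction) pairs produced by toActionSet end up in a gzipped SequenceFile. A compact sketch of that final write in isolation, assuming an already computed Dataset of string pairs (payload and path are placeholders):

import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.sql.SparkSession

object SequenceFileWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SequenceFileWriteSketch").master("local[*]").getOrCreate()
    import spark.implicits._
    val pairs = Seq(("eu.dnetlib.dhp.schema.oaf.Publication", "{\"payload\":\"...\"}")).toDS()
    pairs.rdd
      .map(s => (new Text(s._1), new Text(s._2)))
      .saveAsHadoopFile("/tmp/actionset", classOf[Text], classOf[Text],
        classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
    spark.stop()
  }
}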
@ -1,134 +0,0 @@
package eu.dnetlib.dhp.datacite

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils

import java.io.InputStream
import java.time.format.DateTimeFormatter
import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source

/**
  * This class represents the data model of the input Dataset of Datacite
  * @param doi THE DOI
  * @param timestamp timestamp of last update date
  * @param isActive the record is active or deleted
  * @param json the json native records
  */
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}

/*
  The following classes are utility classes used for the mapping from
  json datacite to OAF Schema
 */
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}

case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}

case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}

case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}

case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}

case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}

case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}

case class DateType(date: Option[String], dateType: Option[String]) {}

case class OAFRelations(relation: String, inverse: String, relType: String)

class DataciteModelConstants extends Serializable {

}

object DataciteModelConstants {

  val REL_TYPE_VALUE: String = "resultResult"
  val DATE_RELATION_KEY = "RelationDate"
  val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
  val DOI_CLASS = "doi"
  val SUBJ_CLASS = "keywords"
  val DATACITE_NAME = "Datacite"
  val dataInfo: DataInfo = dataciteDataInfo("0.9")
  val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)

  val subRelTypeMapping: Map[String, OAFRelations] = Map(
    ModelConstants.REFERENCES -> OAFRelations(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_REFERENCED_BY -> OAFRelations(ModelConstants.IS_REFERENCED_BY, ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),

    ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.IS_SUPPLEMENT_TO, ModelConstants.SUPPLEMENT),
    ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(ModelConstants.IS_SUPPLEMENT_TO, ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.SUPPLEMENT),

    ModelConstants.HAS_PART -> OAFRelations(ModelConstants.HAS_PART, ModelConstants.IS_PART_OF, ModelConstants.PART),
    ModelConstants.IS_PART_OF -> OAFRelations(ModelConstants.IS_PART_OF, ModelConstants.HAS_PART, ModelConstants.PART),

    ModelConstants.IS_VERSION_OF -> OAFRelations(ModelConstants.IS_VERSION_OF, ModelConstants.HAS_VERSION, ModelConstants.VERSION),
    ModelConstants.HAS_VERSION -> OAFRelations(ModelConstants.HAS_VERSION, ModelConstants.IS_VERSION_OF, ModelConstants.VERSION),

    ModelConstants.IS_IDENTICAL_TO -> OAFRelations(ModelConstants.IS_IDENTICAL_TO, ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),

    ModelConstants.IS_CONTINUED_BY -> OAFRelations(ModelConstants.IS_CONTINUED_BY, ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
    ModelConstants.CONTINUES -> OAFRelations(ModelConstants.CONTINUES, ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),

    ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(ModelConstants.IS_NEW_VERSION_OF, ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
    ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),

    ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY, ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
    ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS, ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),

    ModelConstants.IS_SOURCE_OF -> OAFRelations(ModelConstants.IS_SOURCE_OF, ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
    ModelConstants.IS_DERIVED_FROM -> OAFRelations(ModelConstants.IS_DERIVED_FROM, ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),

    ModelConstants.CITES -> OAFRelations(ModelConstants.CITES, ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
    ModelConstants.IS_CITED_BY -> OAFRelations(ModelConstants.IS_CITED_BY, ModelConstants.CITES, ModelConstants.CITATION),

    ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(ModelConstants.IS_VARIANT_FORM_OF, ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
    ModelConstants.IS_OBSOLETED_BY -> OAFRelations(ModelConstants.IS_OBSOLETED_BY, ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),

    ModelConstants.REVIEWS -> OAFRelations(ModelConstants.REVIEWS, ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
    ModelConstants.IS_REVIEWED_BY -> OAFRelations(ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEWS, ModelConstants.REVIEW),

    ModelConstants.COMPILES -> OAFRelations(ModelConstants.COMPILES, ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
    ModelConstants.IS_COMPILED_BY -> OAFRelations(ModelConstants.IS_COMPILED_BY, ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
  )

  val datacite_filter: List[String] = {
    val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
    require(stream != null)
    Source.fromInputStream(stream).getLines().toList
  }

  def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, trust)

  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
  val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)

  val funder_regex: List[(Pattern, String)] = List(
    (Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
    (Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
  )

  val Date_regex: List[Pattern] = List(
    //Y-M-D
    Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
    //M-D-Y
    Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
    //D-M-Y
    Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
    //Y
    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
  )

}
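The subRelTypeMapping table is what lets the DataCite mapping emit each relation together with its inverse. A hedged sketch of a lookup against it (the OAFRelations case class and map are defined in the object above; the key is assumed to be one of the ModelConstants values):

import eu.dnetlib.dhp.datacite.DataciteModelConstants
import eu.dnetlib.dhp.schema.common.ModelConstants

object RelationLookupSketch {
  def main(args: Array[String]): Unit = {
    // e.g. Cites -> IsCitedBy within the "citation" subRelType
    DataciteModelConstants.subRelTypeMapping.get(ModelConstants.CITES).foreach { r =>
      println(s"relation=${r.relation} inverse=${r.inverse} subRelType=${r.relType}")
    }
  }
}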
@ -1,394 +0,0 @@
package eu.dnetlib.dhp.sx.bio

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf._
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import collection.JavaConverters._

object BioDBToOAF {

  case class EBILinkItem(id: Long, links: String) {}

  case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String, targetUrl: String) {}

  case class UniprotDate(date: String, date_info: String) {}

  case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {}

  val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
  val SUBJ_CLASS = "Keywords"

  val DATE_RELATION_KEY = "RelationDate"

  val resolvedURL: Map[String, String] = Map(
    "genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
    "ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
    "clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
    "onim" -> "https://omim.org/entry/",
    "refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
  )

  val collectedFromMap: Map[String, KeyValue] = {
    val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank")
    val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive")
    val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide")
    val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot")
    val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
    val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
    val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
    val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")

    UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
    PDBCollectedFrom.setDataInfo(DATA_INFO)
    ElsevierCollectedFrom.setDataInfo(DATA_INFO)
    EBICollectedFrom.setDataInfo(DATA_INFO)
    pubmedCollectedFrom.setDataInfo(DATA_INFO)
    enaCollectedFrom.setDataInfo(DATA_INFO)
    ncbiCollectedFrom.setDataInfo(DATA_INFO)
    springerNatureCollectedFrom.setDataInfo(DATA_INFO)

    Map(
      "uniprot" -> UNIPROTCollectedFrom,
      "pdb" -> PDBCollectedFrom,
      "elsevier" -> ElsevierCollectedFrom,
      "ebi" -> EBICollectedFrom,
      "Springer Nature" -> springerNatureCollectedFrom,
      "NCBI Nucleotide" -> ncbiCollectedFrom,
      "European Nucleotide Archive" -> enaCollectedFrom,
      "Europe PMC" -> pubmedCollectedFrom
    )
  }

  def crossrefLinksToOaf(input: String): Oaf = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase
    val source_pid_type = (json \ "Source" \ "Identifier" \ "IDScheme").extract[String].toLowerCase

    val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase
    val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase

    val relation_semantic = (json \ "RelationshipType" \ "Name").extract[String]

    val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])

    createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date)
  }

  def scholixResolvedToOAF(input: ScholixResolved): Oaf = {

    val d = new Dataset

    d.setPid(
      List(
        OafMapperUtils.structuredProperty(input.pid.toLowerCase, input.pidType.toLowerCase, input.pidType.toLowerCase, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
      ).asJava
    )

    d.setDataInfo(DATA_INFO)

    val nsPrefix = input.pidType.toLowerCase.padTo(12, '_')
    d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))

    if (input.tilte != null && input.tilte.nonEmpty)
      d.setTitle(List(OafMapperUtils.structuredProperty(input.tilte.head, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)

    d.setOriginalId(List(input.pid).asJava)
    val i = new Instance

    i.setPid(d.getPid)

    if (resolvedURL.contains(input.pidType)) {
      i.setUrl(List(s"${resolvedURL(input.pidType)}${input.pid}").asJava)
    }

    if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
      i.setInstancetype(OafMapperUtils.qualifier("0037", "Clinical Trial", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
    else
      i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))

    if (input.datasource == null || input.datasource.isEmpty)
      return null

    val ds = input.datasource.head
    d.setCollectedfrom(List(collectedFromMap(ds)).asJava)
    i.setCollectedfrom(collectedFromMap(ds))
    d.setInstance(List(i).asJava)

    if (input.authors != null && input.authors.nonEmpty) {
      val authors = input.authors.map(a => {
        val authorOAF = new Author
        authorOAF.setFullname(a)
        authorOAF
      })
      d.setAuthor(authors.asJava)
    }
    if (input.date != null && input.date.nonEmpty) {
      val dt = input.date.head
      i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
      d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
    }
    d
  }

  def uniprotToOAF(input: String): List[Oaf] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val pid = (json \ "pid").extract[String]

    val d = new Dataset

    d.setPid(
      List(
        OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
      ).asJava
    )

    d.setDataInfo(DATA_INFO)
    d.setId(OafMapperUtils.createOpenaireId(50, s"uniprot_____::$pid", true))
    d.setCollectedfrom(List(collectedFromMap("uniprot")).asJava)

    val title: String = (json \ "title").extractOrElse[String](null)

    if (title != null)
      d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)

    d.setOriginalId(List(pid).asJava)
    val i = new Instance

    i.setPid(d.getPid)
    i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
    i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))

    i.setCollectedfrom(collectedFromMap("uniprot"))
    d.setInstance(List(i).asJava)

    val dates: List[UniprotDate] = for {
      JObject(dateOBJ) <- json \ "dates"
      JField("date", JString(date)) <- dateOBJ
      JField("date_info", JString(date_info)) <- dateOBJ
    } yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info)

    val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)

    if (subjects != null) {
      d.setSubject(
        subjects.map(s =>
          OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
        ).asJava)
    }
    var i_date: Option[UniprotDate] = None

    if (dates.nonEmpty) {
      i_date = dates.find(d => d.date_info.contains("entry version"))
      if (i_date.isDefined) {
        i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
        d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
      }
      val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version"))
        .map(date => OafMapperUtils.structuredProperty(date.date, ModelConstants.UNKNOWN, ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO))
      if (relevant_dates.nonEmpty)
        d.setRelevantdate(relevant_dates.asJava)
    }

    val references_pmid: List[String] = for {
      JObject(reference) <- json \ "references"
      JField("PubMed", JString(pid)) <- reference
    } yield pid

    val references_doi: List[String] = for {
      JObject(reference) <- json \ "references"
      JField(" DOI", JString(pid)) <- reference
    } yield pid

    if (references_pmid != null && references_pmid.nonEmpty) {
      val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
      List(d, rel)
    }
    else if (references_doi != null && references_doi.nonEmpty) {
      val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
      List(d, rel)
    }
    else
      List(d)
  }

  def generate_unresolved_id(pid: String, pidType: String): String = {
    s"unresolved::$pid::$pidType"
  }

  def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = {

    val rel = new Relation
    rel.setCollectedfrom(List(collectedFrom).asJava)
    rel.setDataInfo(DATA_INFO)

    rel.setRelType(ModelConstants.RESULT_RESULT)
    rel.setSubRelType(subRelType)
    rel.setRelClass(relClass)

    rel.setSource(sourceId)
    rel.setTarget(s"unresolved::$pid::$pidType")

    val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)

    rel.setProperties(List(dateProps).asJava)

    rel
  }

  def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date: String): Relation = {
    createRelation(pid, pidType, sourceId, collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date)
  }

  def pdbTOOaf(input: String): List[Oaf] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val pdb = (json \ "pdb").extract[String].toLowerCase

    if (pdb.isEmpty)
      return List()

    val d = new Dataset

    d.setPid(
      List(
        OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
      ).asJava
    )

    d.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
    d.setDataInfo(DATA_INFO)
    d.setId(OafMapperUtils.createOpenaireId(50, s"pdb_________::$pdb", true))
    d.setOriginalId(List(pdb).asJava)

    val title = (json \ "title").extractOrElse[String](null)

    if (title == null)
      return List()
    d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)

    val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)

    if (authors != null) {
      val convertedAuthors = authors.zipWithIndex.map { a =>
        val res = new Author
        res.setFullname(a._1)
        res.setRank(a._2 + 1)
        res
      }

      d.setAuthor(convertedAuthors.asJava)
    }

    val i = new Instance

    i.setPid(d.getPid)
    i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
    i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))

    i.setCollectedfrom(collectedFromMap("pdb"))
    d.setInstance(List(i).asJava)

    val pmid = (json \ "pmid").extractOrElse[String](null)

    if (pmid != null)
      List(d, createSupplementaryRelation(pmid, "pmid", d.getId, collectedFromMap("pdb"), null))
    else
      List(d)
  }

  def extractEBILinksFromDump(input: String): EBILinkItem = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)

    val pmid = (json \ "publication" \ "pmid").extract[String]
    val links = (json \ "links").extract[JObject]
    EBILinkItem(pmid.toLong, compact(render(links)))
  }

  def EBITargetLinksFilter(input: EBILinks): Boolean = {
    input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase("pdb") || input.targetPidType.equalsIgnoreCase("uniprot")
  }

  def parse_ebi_links(input: String): List[EBILinks] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val pmid = (json \ "request" \ "id").extract[String]
    for {
      JObject(link) <- json \\ "Link"
      JField("Target", JObject(target)) <- link
      JField("RelationshipType", JObject(relType)) <- link
      JField("Name", JString(relation)) <- relType
      JField("PublicationDate", JString(publicationDate)) <- link
      JField("Title", JString(title)) <- target
      JField("Identifier", JObject(identifier)) <- target
      JField("IDScheme", JString(idScheme)) <- identifier
      JField("IDURL", JString(idUrl)) <- identifier
      JField("ID", JString(id)) <- identifier
    } yield EBILinks(relation, GraphCleaningFunctions.cleanDate(publicationDate), title, pmid, id, idScheme, idUrl)
  }

  def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
    val d = new Dataset
    d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
    d.setDataInfo(DATA_INFO)
    d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)

    val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')

    d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
    d.setOriginalId(List(input.targetPid.toLowerCase).asJava)

    d.setPid(
      List(
        OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
      ).asJava
    )

    val i = new Instance

    i.setPid(d.getPid)
    i.setUrl(List(input.targetUrl).asJava)
    i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))

    i.setCollectedfrom(collectedFromMap("ebi"))
    d.setInstance(List(i).asJava)
    i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
    d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))

    List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, GraphCleaningFunctions.cleanDate(input.date)))
  }
}
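Every relation produced here points at an unresolved target identifier of the form unresolved::pid::pidType, which is later matched against real graph entities. A small sketch of invoking createRelation (the constants and maps come from the object above; the pid and source id values are made up):

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.sx.bio.BioDBToOAF

object UnresolvedRelationSketch {
  def main(args: Array[String]): Unit = {
    val rel = BioDBToOAF.createRelation(
      "10.1000/xyz", "doi", "50|pdb_________::abc",
      BioDBToOAF.collectedFromMap("pdb"),
      ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, "2020-01-01")
    println(rel.getTarget) // unresolved::10.1000/xyz::doi
  }
}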
@ -1,146 +0,0 @@
package eu.dnetlib.dhp.sx.bio.pubmed

import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}

/**
  * Pull parser that iterates over the PMArticle records contained in a PubMed XML stream.
  *
  * @param xml the XML event reader over the PubMed records
  */
class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {

  var currentArticle: PMArticle = generateNextArticle()

  override def hasNext: Boolean = currentArticle != null

  override def next(): PMArticle = {
    val tmp = currentArticle
    currentArticle = generateNextArticle()
    tmp
  }

  def extractAttributes(attrs: MetaData, key: String): String = {

    val res = attrs.get(key)
    if (res.isDefined) {
      val s = res.get
      if (s != null && s.nonEmpty)
        s.head.text
      else
        null
    }
    else null
  }

  def validate_Date(year: String, month: String, day: String): String = {
    try {
      f"${year.toInt}-${month.toInt}%02d-${day.toInt}%02d"
    } catch {
      case _: Throwable => null
    }
  }

  def generateNextArticle(): PMArticle = {

    var currentSubject: PMSubject = null
    var currentAuthor: PMAuthor = null
    var currentJournal: PMJournal = null
    var currentGrant: PMGrant = null
    var currNode: String = null
    var currentYear = "0"
    var currentMonth = "01"
    var currentDay = "01"
    var currentArticleType: String = null

    while (xml.hasNext) {
      xml.next match {
        case EvElemStart(_, label, attrs, _) =>
          currNode = label

          label match {
            case "PubmedArticle" => currentArticle = new PMArticle
            case "Author" => currentAuthor = new PMAuthor
            case "Journal" => currentJournal = new PMJournal
            case "Grant" => currentGrant = new PMGrant
            case "PublicationType" | "DescriptorName" =>
              currentSubject = new PMSubject
              currentSubject.setMeshId(extractAttributes(attrs, "UI"))
            case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType")
            case _ =>
          }
        case EvElemEnd(_, label) =>
          label match {
            case "PubmedArticle" => return currentArticle
            case "Author" => currentArticle.getAuthors.add(currentAuthor)
            case "Journal" => currentArticle.setJournal(currentJournal)
            case "Grant" => currentArticle.getGrants.add(currentGrant)
            case "PubMedPubDate" => if (currentArticle.getDate == null)
              currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
            case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
            case "DescriptorName" => currentArticle.getSubjects.add(currentSubject)
            case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject)
            case _ =>
          }
        case EvText(text) =>
          if (currNode != null && text.trim.nonEmpty)
            currNode match {
              case "ArticleTitle" => {
                if (currentArticle.getTitle == null)
                  currentArticle.setTitle(text.trim)
                else
                  currentArticle.setTitle(currentArticle.getTitle + text.trim)
              }
              case "AbstractText" => {
                if (currentArticle.getDescription == null)
                  currentArticle.setDescription(text.trim)
                else
                  currentArticle.setDescription(currentArticle.getDescription + text.trim)
              }
              case "PMID" => currentArticle.setPmid(text.trim)
              case "ArticleId" => if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
              case "Language" => currentArticle.setLanguage(text.trim)
              case "ISSN" => currentJournal.setIssn(text.trim)
              case "GrantID" => currentGrant.setGrantID(text.trim)
              case "Agency" => currentGrant.setAgency(text.trim)
              case "Country" => if (currentGrant != null) currentGrant.setCountry(text.trim)
              case "Year" => currentYear = text.trim
              case "Month" => currentMonth = text.trim
              case "Day" => currentDay = text.trim
              case "Volume" => currentJournal.setVolume(text.trim)
              case "Issue" => currentJournal.setIssue(text.trim)
              case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim)
              case "LastName" => {
                if (currentAuthor != null)
                  currentAuthor.setLastName(text.trim)
              }
              case "ForeName" => if (currentAuthor != null)
                currentAuthor.setForeName(text.trim)
              case "Title" =>
                if (currentJournal.getTitle == null)
                  currentJournal.setTitle(text.trim)
                else
                  currentJournal.setTitle(currentJournal.getTitle + text.trim)
              case _ =>
            }
        case _ =>
      }
    }
    null
  }
}
@ -1,32 +0,0 @@
[
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "ip",
    "paramLongName": "inputPath",
    "paramDescription": "the path of the input data",
    "paramRequired": true
  },
  {
    "paramName": "o",
    "paramLongName": "outputPath",
    "paramDescription": "the path of the new ActionSet",
    "paramRequired": true
  },
  {
    "paramName": "rtn",
    "paramLongName": "resultTableName",
    "paramDescription": "the name of the result table",
    "paramRequired": true
  },
  {
    "paramName": "bsp",
    "paramLongName": "bipScorePath",
    "paramDescription": "the path where the bip scores are stored",
    "paramRequired": true
  }
]
@ -0,0 +1,37 @@
[
  {
    "paramName": "wp",
    "paramLongName": "workingPath",
    "paramDescription": "the path where the compressed OpenCitations files are stored",
    "paramRequired": true
  },
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "d",
    "paramLongName": "delimiter",
    "paramDescription": "the delimiter used in the input CSV files",
    "paramRequired": false
  },
  {
    "paramName": "op",
    "paramLongName": "outputPath",
    "paramDescription": "the path where to store the output JSON files",
    "paramRequired": true
  },
  {
    "paramName": "if",
    "paramLongName": "inputFile",
    "paramDescription": "the semicolon separated list of input file names",
    "paramRequired": true
  }
]
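These JSON descriptors drive ArgumentApplicationParser: paramName is the short CLI flag, paramLongName the long one, and get() retrieves values by long name. A sketch of parsing a command line matching the descriptor above (the argument values are examples only):

import scala.io.Source
import eu.dnetlib.dhp.application.ArgumentApplicationParser

object ParserSketch {
  def main(args: Array[String]): Unit = {
    val json = Source.fromInputStream(getClass.getResourceAsStream(
      "/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json")).mkString
    val parser = new ArgumentApplicationParser(json)
    parser.parseArgument(Array(
      "--workingPath", "/data/oc/extracted",
      "--outputPath", "/data/oc/json/",
      "--inputFile", "2020-01;2020-02"))
    println(parser.get("inputFile")) // 2020-01;2020-02
  }
}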
@ -0,0 +1,63 @@
from urllib.request import urlopen
import json


def retrieve_datacite_clients(base_url):
    datacite_clients = {}
    while base_url is not None:
        with urlopen(base_url) as response:
            print(f"requesting {base_url}")
            response_content = response.read()
            data = json.loads(response_content)
            if 'data' in data and len(data['data']) > 0:
                for item in data['data']:
                    datacite_clients[item['id'].lower()] = item['attributes']['re3data'].lower().replace("https://doi.org/", "")
                # stop when the API exposes no further page
                base_url = data['links'].get('next')
            else:
                base_url = None
    return datacite_clients


def retrieve_r3data(start_url):
    r3data_clients = {}
    page_number = 1
    base_url = start_url
    while base_url is not None:
        with urlopen(base_url) as response:
            print(f"requesting {base_url}")
            response_content = response.read()
            data = json.loads(response_content)
            if 'data' in data and len(data['data']) > 0:
                for item in data['data']:
                    r3data_clients[item['id'].lower()] = dict(
                        openaire_id="re3data_____::" + item['attributes']['re3dataId'].lower(),
                        official_name=item['attributes']['repositoryName']
                    )
                page_number += 1
                base_url = f"{start_url}&page[number]={page_number}"
            else:
                base_url = None
    return r3data_clients


base_url = "https://api.datacite.org/clients?query=re3data_id:*&page[size]=250"

dc = retrieve_datacite_clients(base_url)
r3 = retrieve_r3data("https://api.datacite.org/re3data?page[size]=250")

result = {}

for item in dc:
    res = dc[item].lower()
    if res not in r3:
        print(f"missing {res} for {item} in dictionary")
    else:
        result[item.upper()] = dict(openaire_id=r3[res]["openaire_id"], datacite_name=r3[res]["official_name"], official_name=r3[res]["official_name"])

with open('hostedBy_map.json', 'w', encoding='utf8') as json_file:
    json.dump(result, json_file, ensure_ascii=False, indent=1)
File diff suppressed because it is too large
@ -0,0 +1,19 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
</configuration>
@ -0,0 +1,62 @@
<workflow-app name="Retrieve Scholix Update" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sourcePath</name>
            <description>the source path of scholix graph</description>
        </property>
        <property>
            <name>datacitePath</name>
            <description>the datacite native path</description>
        </property>
        <property>
            <name>workingSupportPath</name>
            <description>the working Support path</description>
        </property>
        <property>
            <name>isLookupUrl</name>
            <description>The IS lookUp service endpoint</description>
        </property>
        <property>
            <name>updateDS</name>
            <value>false</value>
            <description>when true, regenerate all the support Datasets</description>
        </property>
    </parameters>

    <start to="RetrieveDeltaDatacite"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="RetrieveDeltaDatacite">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>New Update from Datacite to Scholix</name>
            <class>eu.dnetlib.dhp.sx.graph.SparkRetrieveDataciteDelta</class>
            <jar>dhp-aggregation-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=6000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--sourcePath</arg><arg>${sourcePath}</arg>
            <arg>--datacitePath</arg><arg>${datacitePath}</arg>
            <arg>--master</arg><arg>yarn</arg>
            <arg>--workingSupportPath</arg><arg>${workingSupportPath}</arg>
            <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
            <arg>--updateDS</arg><arg>${updateDS}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>

</workflow-app>
@ -0,0 +1,41 @@
[
  {
    "paramName": "s",
    "paramLongName": "sourcePath",
    "paramDescription": "the source mdstore path",
    "paramRequired": true
  },
  {
    "paramName": "d",
    "paramLongName": "datacitePath",
    "paramDescription": "the datacite native path",
    "paramRequired": true
  },
  {
    "paramName": "w",
    "paramLongName": "workingSupportPath",
    "paramDescription": "the working Support path",
    "paramRequired": true
  },
  {
    "paramName": "i",
    "paramLongName": "isLookupUrl",
    "paramDescription": "the isLookup URL",
    "paramRequired": true
  },
  {
    "paramName": "m",
    "paramLongName": "master",
    "paramDescription": "the master name",
    "paramRequired": true
  },
  {
    "paramName": "u",
    "paramLongName": "updateDS",
    "paramDescription": "when true, regenerate all the support Datasets",
    "paramRequired": false
  }
]
@ -0,0 +1,278 @@
package eu.dnetlib.dhp.datacite

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}

import java.io.InputStream
import java.time.format.DateTimeFormatter
import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source

/** This class represents the data model of the input Datacite Dataset
  * @param doi the DOI
  * @param timestamp the timestamp of the last update date
  * @param isActive whether the record is active or deleted
  * @param json the native JSON record
  */
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}

/*
  The following are utility classes used for the mapping from
  Datacite JSON to the OAF schema
 */
case class RelatedIdentifierType(
  relationType: String,
  relatedIdentifier: String,
  relatedIdentifierType: String
) {}

case class NameIdentifiersType(
  nameIdentifierScheme: Option[String],
  schemeUri: Option[String],
  nameIdentifier: Option[String]
) {}

case class CreatorType(
  nameType: Option[String],
  nameIdentifiers: Option[List[NameIdentifiersType]],
  name: Option[String],
  familyName: Option[String],
  givenName: Option[String],
  affiliation: Option[List[String]]
) {}

case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}

case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}

case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}

case class FundingReferenceType(
  funderIdentifierType: Option[String],
  awardTitle: Option[String],
  awardUri: Option[String],
  funderName: Option[String],
  funderIdentifier: Option[String],
  awardNumber: Option[String]
) {}

case class DateType(date: Option[String], dateType: Option[String]) {}

case class OAFRelations(relation: String, inverse: String, relType: String)

class DataciteModelConstants extends Serializable {}

object DataciteModelConstants {

  val REL_TYPE_VALUE: String = "resultResult"
  val DATE_RELATION_KEY = "RelationDate"
  val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
  val DOI_CLASS = "doi"
  val SUBJ_CLASS = "keywords"
  val DATACITE_NAME = "Datacite"
  val dataInfo: DataInfo = dataciteDataInfo("0.9")

  val DATACITE_COLLECTED_FROM: KeyValue =
    OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)

  val subRelTypeMapping: Map[String, OAFRelations] = Map(
    ModelConstants.REFERENCES -> OAFRelations(
      ModelConstants.REFERENCES,
      ModelConstants.IS_REFERENCED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_REFERENCED_BY -> OAFRelations(
      ModelConstants.IS_REFERENCED_BY,
      ModelConstants.REFERENCES,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.SUPPLEMENT
    ),
    ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
      ModelConstants.IS_SUPPLEMENT_TO,
      ModelConstants.IS_SUPPLEMENTED_BY,
      ModelConstants.SUPPLEMENT
    ),
    ModelConstants.HAS_PART -> OAFRelations(
      ModelConstants.HAS_PART,
      ModelConstants.IS_PART_OF,
      ModelConstants.PART
    ),
    ModelConstants.IS_PART_OF -> OAFRelations(
      ModelConstants.IS_PART_OF,
      ModelConstants.HAS_PART,
      ModelConstants.PART
    ),
    ModelConstants.IS_VERSION_OF -> OAFRelations(
      ModelConstants.IS_VERSION_OF,
      ModelConstants.HAS_VERSION,
      ModelConstants.VERSION
    ),
    ModelConstants.HAS_VERSION -> OAFRelations(
      ModelConstants.HAS_VERSION,
      ModelConstants.IS_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.IS_IDENTICAL_TO,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_CONTINUED_BY -> OAFRelations(
      ModelConstants.IS_CONTINUED_BY,
      ModelConstants.CONTINUES,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.CONTINUES -> OAFRelations(
      ModelConstants.CONTINUES,
      ModelConstants.IS_CONTINUED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
      ModelConstants.IS_PREVIOUS_VERSION_OF,
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.DOCUMENTS,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.DOCUMENTS -> OAFRelations(
      ModelConstants.DOCUMENTS,
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_SOURCE_OF -> OAFRelations(
      ModelConstants.IS_SOURCE_OF,
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_DERIVED_FROM -> OAFRelations(
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.IS_SOURCE_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.CITES -> OAFRelations(
      ModelConstants.CITES,
      ModelConstants.IS_CITED_BY,
      ModelConstants.CITATION
    ),
    ModelConstants.IS_CITED_BY -> OAFRelations(
      ModelConstants.IS_CITED_BY,
      ModelConstants.CITES,
      ModelConstants.CITATION
    ),
    ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
      ModelConstants.IS_VARIANT_FORM_OF,
      ModelConstants.IS_DERIVED_FROM,
      ModelConstants.VERSION
    ),
    ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
      ModelConstants.IS_OBSOLETED_BY,
      ModelConstants.IS_NEW_VERSION_OF,
      ModelConstants.VERSION
    ),
    ModelConstants.REVIEWS -> OAFRelations(
      ModelConstants.REVIEWS,
      ModelConstants.IS_REVIEWED_BY,
      ModelConstants.REVIEW
    ),
    ModelConstants.IS_REVIEWED_BY -> OAFRelations(
      ModelConstants.IS_REVIEWED_BY,
      ModelConstants.REVIEWS,
      ModelConstants.REVIEW
    ),
    ModelConstants.DOCUMENTS -> OAFRelations(
      ModelConstants.DOCUMENTS,
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
      ModelConstants.IS_DOCUMENTED_BY,
      ModelConstants.DOCUMENTS,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.COMPILES -> OAFRelations(
      ModelConstants.COMPILES,
      ModelConstants.IS_COMPILED_BY,
      ModelConstants.RELATIONSHIP
    ),
    ModelConstants.IS_COMPILED_BY -> OAFRelations(
      ModelConstants.IS_COMPILED_BY,
      ModelConstants.COMPILES,
      ModelConstants.RELATIONSHIP
    )
  )
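  /* A minimal usage sketch (not part of the original source): given a Datacite
     relation type, subRelTypeMapping yields the OAF relation, its inverse and the
     sub-relation type, which is enough to emit both verses of a link, e.g.:
     subRelTypeMapping.get(ModelConstants.CITES).map { m =>
       (m.relation, m.inverse, m.relType) // the relation, its inverse, the subRelType
     }
   */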
  val datacite_filter: List[String] = {
    val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
    require(stream != null)
    Source.fromInputStream(stream).getLines().toList
  }

  def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
    false,
    null,
    false,
    false,
    ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
    trust
  )

  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
    "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
    Locale.ENGLISH
  )
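  /* A usage sketch (not part of the original source): df_en chains several optional
     [...] patterns, so heterogeneous Datacite date strings can be parsed with one
     formatter, e.g. both of the following should yield the same java.time.LocalDate:
     LocalDate.parse("2021-05-03", df_en)
     LocalDate.parse("03-May-2021", df_en)
   */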
  val df_it: DateTimeFormatter =
    DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)

  val funder_regex: List[(Pattern, String)] = List(
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda__h2020::"
    ),
    (
      Pattern.compile(
        "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
        Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
      ),
      "40|corda_______::"
    )
  )

  val Date_regex: List[Pattern] = List(
    //Y-M-D
    Pattern.compile(
      "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
      Pattern.MULTILINE
    ),
    //M-D-Y
    Pattern.compile(
      "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
      Pattern.MULTILINE
    ),
    //D-M-Y
    Pattern.compile(
      "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
      Pattern.MULTILINE
    ),
    //Y
    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
  )

}
@ -0,0 +1,597 @@
package eu.dnetlib.dhp.sx.bio

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf._
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import collection.JavaConverters._

object BioDBToOAF {

  case class EBILinkItem(id: Long, links: String) {}

  case class EBILinks(
    relType: String,
    date: String,
    title: String,
    pmid: String,
    targetPid: String,
    targetPidType: String,
    targetUrl: String
  ) {}

  case class UniprotDate(date: String, date_info: String) {}

  case class ScholixResolved(
    pid: String,
    pidType: String,
    typology: String,
    tilte: List[String],
    datasource: List[String],
    date: List[String],
    authors: List[String]
  ) {}

  val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
    false,
    null,
    false,
    false,
    ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
    "0.9"
  )
  val SUBJ_CLASS = "Keywords"

  val DATE_RELATION_KEY = "RelationDate"

  val resolvedURL: Map[String, String] = Map(
    "genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
    "ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
    "clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
    "onim" -> "https://omim.org/entry/",
    "refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
    "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
  )
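  /* A usage sketch (not part of the original source): the landing page URL of a
     resolved record is obtained by appending the PID to the scheme's base URL, e.g.
     resolvedURL("ena") + "LT934502"  // hypothetical accession, for illustration only
   */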
  val collectedFromMap: Map[String, KeyValue] = {
    val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue(
      "10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2",
      "Protein Data Bank"
    )
    val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue(
      "10|re3data_____::c2a591f440598b63d854556beaf01591",
      "European Nucleotide Archive"
    )
    val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue(
      "10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6",
      "NCBI Nucleotide"
    )
    val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue(
      "10|re3data_____::296e1abaf1302897a6838d3588cd0310",
      "UniProtKB/Swiss-Prot"
    )
    val ElsevierCollectedFrom: KeyValue =
      OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
    val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue(
      "10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e",
      "Springer Nature"
    )
    val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue(
      "10|opendoar____::83e60e09c222f206c725385f53d7e567c",
      "EMBL-EBIs Protein Data Bank in Europe (PDBe)"
    )
    val pubmedCollectedFrom: KeyValue =
      OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")

    UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
    PDBCollectedFrom.setDataInfo(DATA_INFO)
    ElsevierCollectedFrom.setDataInfo(DATA_INFO)
    EBICollectedFrom.setDataInfo(DATA_INFO)
    pubmedCollectedFrom.setDataInfo(DATA_INFO)
    enaCollectedFrom.setDataInfo(DATA_INFO)
    ncbiCollectedFrom.setDataInfo(DATA_INFO)
    springerNatureCollectedFrom.setDataInfo(DATA_INFO)

    Map(
      "uniprot" -> UNIPROTCollectedFrom,
      "pdb" -> PDBCollectedFrom,
      "elsevier" -> ElsevierCollectedFrom,
      "ebi" -> EBICollectedFrom,
      "Springer Nature" -> springerNatureCollectedFrom,
      "NCBI Nucleotide" -> ncbiCollectedFrom,
      "European Nucleotide Archive" -> enaCollectedFrom,
      "Europe PMC" -> pubmedCollectedFrom
    )
  }

  def crossrefLinksToOaf(input: String): Oaf = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase
    val source_pid_type = (json \ "Source" \ "Identifier" \ "IDScheme").extract[String].toLowerCase

    val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase
    val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase

    val relation_semantic = (json \ "RelationshipType" \ "Name").extract[String]

    val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])

    createRelation(
      target_pid,
      target_pid_type,
      generate_unresolved_id(source_pid, source_pid_type),
      collectedFromMap("elsevier"),
      "relationship",
      relation_semantic,
      date
    )
  }
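  /* Sketch of the input JSON shape assumed by crossrefLinksToOaf, derived from the
     extraction paths above (all values are illustrative only):
     {
       "Source": { "Identifier": { "ID": "10.1016/j.example", "IDScheme": "doi" } },
       "Target": { "Identifier": { "ID": "LT934502", "IDScheme": "ena" } },
       "RelationshipType": { "Name": "References" },
       "LinkedPublicationDate": "2021-05-03"
     }
   */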
  def scholixResolvedToOAF(input: ScholixResolved): Oaf = {

    val d = new Dataset

    d.setPid(
      List(
        OafMapperUtils.structuredProperty(
          input.pid.toLowerCase,
          input.pidType.toLowerCase,
          input.pidType.toLowerCase,
          ModelConstants.DNET_PID_TYPES,
          ModelConstants.DNET_PID_TYPES,
          DATA_INFO
        )
      ).asJava
    )

    d.setDataInfo(DATA_INFO)

    val nsPrefix = input.pidType.toLowerCase.padTo(12, '_')
    d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))

    if (input.tilte != null && input.tilte.nonEmpty)
      d.setTitle(
        List(
          OafMapperUtils.structuredProperty(
            input.tilte.head,
            ModelConstants.MAIN_TITLE_QUALIFIER,
            DATA_INFO
          )
        ).asJava
      )

    d.setOriginalId(List(input.pid).asJava)
    val i = new Instance

    i.setPid(d.getPid)

    if (resolvedURL.contains(input.pidType)) {
      i.setUrl(List(s"${resolvedURL(input.pidType)}${input.pid}").asJava)
    }

    if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
      i.setInstancetype(
        OafMapperUtils.qualifier(
          "0037",
          "Clinical Trial",
          ModelConstants.DNET_PUBLICATION_RESOURCE,
          ModelConstants.DNET_PUBLICATION_RESOURCE
        )
      )
    else
      i.setInstancetype(
        OafMapperUtils.qualifier(
          "0046",
          "Bioentity",
          ModelConstants.DNET_PUBLICATION_RESOURCE,
          ModelConstants.DNET_PUBLICATION_RESOURCE
        )
      )

    if (input.datasource == null || input.datasource.isEmpty)
      return null

    val ds = input.datasource.head
    d.setCollectedfrom(List(collectedFromMap(ds)).asJava)
    i.setCollectedfrom(collectedFromMap(ds))
    d.setInstance(List(i).asJava)

    if (input.authors != null && input.authors.nonEmpty) {
      val authors = input.authors.map(a => {
        val authorOAF = new Author
        authorOAF.setFullname(a)
        authorOAF
      })
      d.setAuthor(authors.asJava)
    }
    if (input.date != null && input.date.nonEmpty) {
      val dt = input.date.head
      i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
      d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
    }
    d
  }

  def uniprotToOAF(input: String): List[Oaf] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val pid = (json \ "pid").extract[String]

    val d = new Dataset

    d.setPid(
      List(
        OafMapperUtils.structuredProperty(
          pid,
          "uniprot",
          "uniprot",
          ModelConstants.DNET_PID_TYPES,
          ModelConstants.DNET_PID_TYPES,
          DATA_INFO
        )
      ).asJava
    )

    d.setDataInfo(DATA_INFO)
    d.setId(OafMapperUtils.createOpenaireId(50, s"uniprot_____::$pid", true))
    d.setCollectedfrom(List(collectedFromMap("uniprot")).asJava)

    val title: String = (json \ "title").extractOrElse[String](null)

    if (title != null)
      d.setTitle(
        List(
          OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
        ).asJava
      )

    d.setOriginalId(List(pid).asJava)
    val i = new Instance

    i.setPid(d.getPid)
    i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
    i.setInstancetype(
      OafMapperUtils.qualifier(
        "0046",
        "Bioentity",
        ModelConstants.DNET_PUBLICATION_RESOURCE,
        ModelConstants.DNET_PUBLICATION_RESOURCE
      )
    )

    i.setCollectedfrom(collectedFromMap("uniprot"))
    d.setInstance(List(i).asJava)

    val dates: List[UniprotDate] = for {
      JObject(dateOBJ) <- json \ "dates"
      JField("date", JString(date)) <- dateOBJ
      JField("date_info", JString(date_info)) <- dateOBJ
    } yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info)

    val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)

    if (subjects != null) {
      d.setSubject(
        subjects
          .map(s =>
            OafMapperUtils.structuredProperty(
              s,
              SUBJ_CLASS,
              SUBJ_CLASS,
              ModelConstants.DNET_SUBJECT_TYPOLOGIES,
              ModelConstants.DNET_SUBJECT_TYPOLOGIES,
              null
            )
          )
          .asJava
      )
    }
    var i_date: Option[UniprotDate] = None

    if (dates.nonEmpty) {
      i_date = dates.find(d => d.date_info.contains("entry version"))
      if (i_date.isDefined) {
        i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
        d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
      }
      val relevant_dates: List[StructuredProperty] = dates
        .filter(d => !d.date_info.contains("entry version"))
        .map(date =>
          OafMapperUtils.structuredProperty(
            date.date,
            ModelConstants.UNKNOWN,
            ModelConstants.UNKNOWN,
            ModelConstants.DNET_DATACITE_DATE,
            ModelConstants.DNET_DATACITE_DATE,
            DATA_INFO
          )
        )
      if (relevant_dates != null && relevant_dates.nonEmpty)
        d.setRelevantdate(relevant_dates.asJava)
    }

    val references_pmid: List[String] = for {
      JObject(reference) <- json \ "references"
      JField("PubMed", JString(pid)) <- reference
    } yield pid

    val references_doi: List[String] = for {
      JObject(reference) <- json \ "references"
      JField(" DOI", JString(pid)) <- reference
    } yield pid

    if (references_pmid != null && references_pmid.nonEmpty) {
      val rel = createRelation(
        references_pmid.head,
        "pmid",
        d.getId,
        collectedFromMap("uniprot"),
        ModelConstants.RELATIONSHIP,
        ModelConstants.IS_RELATED_TO,
        if (i_date.isDefined) i_date.get.date else null
      )
      List(d, rel)
    } else if (references_doi != null && references_doi.nonEmpty) {
      val rel = createRelation(
        references_doi.head,
        "doi",
        d.getId,
        collectedFromMap("uniprot"),
        ModelConstants.RELATIONSHIP,
        ModelConstants.IS_RELATED_TO,
        if (i_date.isDefined) i_date.get.date else null
      )
      List(d, rel)
    } else
      List(d)
  }

  def generate_unresolved_id(pid: String, pidType: String): String = {
    s"unresolved::$pid::$pidType"
  }
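  /* Illustrative behaviour of generate_unresolved_id (format taken directly from the
     interpolated string above; the PID values are examples only):
     generate_unresolved_id("10.1000/xyz123", "doi") == "unresolved::10.1000/xyz123::doi"
   */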
  def createRelation(
    pid: String,
    pidType: String,
    sourceId: String,
    collectedFrom: KeyValue,
    subRelType: String,
    relClass: String,
    date: String
  ): Relation = {

    val rel = new Relation
    rel.setCollectedfrom(List(collectedFrom).asJava)
    rel.setDataInfo(DATA_INFO)

    rel.setRelType(ModelConstants.RESULT_RESULT)
    rel.setSubRelType(subRelType)
    rel.setRelClass(relClass)

    rel.setSource(sourceId)
    rel.setTarget(s"unresolved::$pid::$pidType")

    val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)

    rel.setProperties(List(dateProps).asJava)

    rel
  }

  def createSupplementaryRelation(
    pid: String,
    pidType: String,
    sourceId: String,
    collectedFrom: KeyValue,
    date: String
  ): Relation = {
    createRelation(
      pid,
      pidType,
      sourceId,
      collectedFrom,
      ModelConstants.SUPPLEMENT,
      ModelConstants.IS_SUPPLEMENT_TO,
      date
    )
  }

  def pdbTOOaf(input: String): List[Oaf] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val pdb = (json \ "pdb").extract[String].toLowerCase

    if (pdb.isEmpty)
      return List()

    val d = new Dataset

    d.setPid(
      List(
        OafMapperUtils.structuredProperty(
          pdb,
          "pdb",
          "Protein Data Bank Identifier",
          ModelConstants.DNET_PID_TYPES,
          ModelConstants.DNET_PID_TYPES,
          DATA_INFO
        )
      ).asJava
    )

    d.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
    d.setDataInfo(DATA_INFO)
    d.setId(OafMapperUtils.createOpenaireId(50, s"pdb_________::$pdb", true))
    d.setOriginalId(List(pdb).asJava)

    val title = (json \ "title").extractOrElse[String](null)

    if (title == null)
      return List()
    d.setTitle(
      List(
        OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
      ).asJava
    )

    val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)

    if (authors != null) {
      val convertedAuthors = authors.zipWithIndex.map { a =>
        val res = new Author
        res.setFullname(a._1)
        res.setRank(a._2 + 1)
        res
      }

      d.setAuthor(convertedAuthors.asJava)
    }

    val i = new Instance

    i.setPid(d.getPid)
    i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
    i.setInstancetype(
      OafMapperUtils.qualifier(
        "0046",
        "Bioentity",
        ModelConstants.DNET_PUBLICATION_RESOURCE,
        ModelConstants.DNET_PUBLICATION_RESOURCE
      )
    )

    i.setCollectedfrom(collectedFromMap("pdb"))
    d.setInstance(List(i).asJava)

    val pmid = (json \ "pmid").extractOrElse[String](null)

    if (pmid != null)
      List(d, createSupplementaryRelation(pmid, "pmid", d.getId, collectedFromMap("pdb"), null))
    else
      List(d)
  }

  def extractEBILinksFromDump(input: String): EBILinkItem = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)

    val pmid = (json \ "publication" \ "pmid").extract[String]
    val links = (json \ "links").extract[JObject]
    EBILinkItem(pmid.toLong, compact(render(links)))
  }

  def EBITargetLinksFilter(input: EBILinks): Boolean = {
    input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase(
      "pdb"
    ) || input.targetPidType.equalsIgnoreCase("uniprot")
  }

  def parse_ebi_links(input: String): List[EBILinks] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val pmid = (json \ "request" \ "id").extract[String]
    for {
      JObject(link) <- json \\ "Link"
      JField("Target", JObject(target)) <- link
      JField("RelationshipType", JObject(relType)) <- link
      JField("Name", JString(relation)) <- relType
      JField("PublicationDate", JString(publicationDate)) <- link
      JField("Title", JString(title)) <- target
      JField("Identifier", JObject(identifier)) <- target
      JField("IDScheme", JString(idScheme)) <- identifier
      JField("IDURL", JString(idUrl)) <- identifier
      JField("ID", JString(id)) <- identifier

    } yield EBILinks(
      relation,
      GraphCleaningFunctions.cleanDate(publicationDate),
      title,
      pmid,
      id,
      idScheme,
      idUrl
    )
  }

  def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
    val d = new Dataset
    d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
    d.setDataInfo(DATA_INFO)
    d.setTitle(
      List(
        OafMapperUtils.structuredProperty(
          input.title,
          ModelConstants.MAIN_TITLE_QUALIFIER,
          DATA_INFO
        )
      ).asJava
    )

    val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')

    d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
    d.setOriginalId(List(input.targetPid.toLowerCase).asJava)

    d.setPid(
      List(
        OafMapperUtils.structuredProperty(
          input.targetPid.toLowerCase,
          input.targetPidType.toLowerCase,
          "Protein Data Bank Identifier",
          ModelConstants.DNET_PID_TYPES,
          ModelConstants.DNET_PID_TYPES,
          DATA_INFO
        )
      ).asJava
    )

    val i = new Instance

    i.setPid(d.getPid)
    i.setUrl(List(input.targetUrl).asJava)
    i.setInstancetype(
      OafMapperUtils.qualifier(
        "0046",
        "Bioentity",
        ModelConstants.DNET_PUBLICATION_RESOURCE,
        ModelConstants.DNET_PUBLICATION_RESOURCE
      )
    )

    i.setCollectedfrom(collectedFromMap("ebi"))
    d.setInstance(List(i).asJava)
    i.setDateofacceptance(
      OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
    )
    d.setDateofacceptance(
      OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
    )

    List(
      d,
      createRelation(
        input.pmid,
        "pmid",
        d.getId,
        collectedFromMap("ebi"),
        ModelConstants.RELATIONSHIP,
        ModelConstants.IS_RELATED_TO,
        GraphCleaningFunctions.cleanDate(input.date)
      )
    )
  }
}
@ -0,0 +1,133 @@
package eu.dnetlib.dhp.sx.bio.pubmed

import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}

/** @param xml the XML event reader over a PubMed dump
  */
class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {

  var currentArticle: PMArticle = generateNextArticle()

  override def hasNext: Boolean = currentArticle != null

  override def next(): PMArticle = {
    val tmp = currentArticle
    currentArticle = generateNextArticle()
    tmp
  }

  def extractAttributes(attrs: MetaData, key: String): String = {

    val res = attrs.get(key)
    if (res.isDefined) {
      val s = res.get
      if (s != null && s.nonEmpty)
        s.head.text
      else
        null
    } else null
  }

  def validate_Date(year: String, month: String, day: String): String = {
    try {
      f"${year.toInt}-${month.toInt}%02d-${day.toInt}%02d"
    } catch {
      case _: Throwable => null
    }
  }
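  /* Illustrative behaviour of validate_Date, derived from the format string above:
     validate_Date("2020", "1", "5")  == "2020-01-05"
     validate_Date("2020", "xx", "5") == null   // non-numeric input falls into the catch
   */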
  def generateNextArticle(): PMArticle = {

    var currentSubject: PMSubject = null
    var currentAuthor: PMAuthor = null
    var currentJournal: PMJournal = null
    var currentGrant: PMGrant = null
    var currNode: String = null
    var currentYear = "0"
    var currentMonth = "01"
    var currentDay = "01"
    var currentArticleType: String = null

    while (xml.hasNext) {
      xml.next match {
        case EvElemStart(_, label, attrs, _) =>
          currNode = label

          label match {
            case "PubmedArticle" => currentArticle = new PMArticle
            case "Author"        => currentAuthor = new PMAuthor
            case "Journal"       => currentJournal = new PMJournal
            case "Grant"         => currentGrant = new PMGrant
            case "PublicationType" | "DescriptorName" =>
              currentSubject = new PMSubject
              currentSubject.setMeshId(extractAttributes(attrs, "UI"))
            case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType")
            case _           =>
          }
        case EvElemEnd(_, label) =>
          label match {
            case "PubmedArticle" => return currentArticle
            case "Author"        => currentArticle.getAuthors.add(currentAuthor)
            case "Journal"       => currentArticle.setJournal(currentJournal)
            case "Grant"         => currentArticle.getGrants.add(currentGrant)
            case "PubMedPubDate" =>
              if (currentArticle.getDate == null)
                currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
            case "PubDate"         => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
            case "DescriptorName"  => currentArticle.getSubjects.add(currentSubject)
            case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject)
            case _                 =>
          }
        case EvText(text) =>
          if (currNode != null && text.trim.nonEmpty)
            currNode match {
              case "ArticleTitle" => {
                if (currentArticle.getTitle == null)
                  currentArticle.setTitle(text.trim)
                else
                  currentArticle.setTitle(currentArticle.getTitle + text.trim)
              }
              case "AbstractText" => {
                if (currentArticle.getDescription == null)
                  currentArticle.setDescription(text.trim)
                else
                  currentArticle.setDescription(currentArticle.getDescription + text.trim)
              }
              case "PMID" => currentArticle.setPmid(text.trim)
              case "ArticleId" =>
                if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
              case "Language" => currentArticle.setLanguage(text.trim)
              case "ISSN"     => currentJournal.setIssn(text.trim)
              case "GrantID"  => currentGrant.setGrantID(text.trim)
              case "Agency"   => currentGrant.setAgency(text.trim)
              case "Country"  => if (currentGrant != null) currentGrant.setCountry(text.trim)
              case "Year"     => currentYear = text.trim
              case "Month"    => currentMonth = text.trim
              case "Day"      => currentDay = text.trim
              case "Volume"   => currentJournal.setVolume(text.trim)
              case "Issue"    => currentJournal.setIssue(text.trim)
              case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim)
              case "LastName" => {
                if (currentAuthor != null)
                  currentAuthor.setLastName(text.trim)
              }
              case "ForeName" =>
                if (currentAuthor != null)
                  currentAuthor.setForeName(text.trim)
              case "Title" =>
                if (currentJournal.getTitle == null)
                  currentJournal.setTitle(text.trim)
                else
                  currentJournal.setTitle(currentJournal.getTitle + text.trim)
              case _ =>

            }
        case _ =>
      }
    }
    null
  }
}
@ -0,0 +1,345 @@
package eu.dnetlib.dhp.sx.graph

import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.datacite.{DataciteToOAFTransformation, DataciteType}
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import eu.dnetlib.dhp.utils.{DHPUtils, ISLookupClientFactory}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.functions.max
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

import scala.collection.JavaConverters._
import java.text.SimpleDateFormat

class SparkRetrieveDataciteDelta(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log: Logger) {

  val ISO_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ssZ"
  val simpleFormatter = new SimpleDateFormat(ISO_DATE_PATTERN)

  val SCHOLIX_RESOURCE_PATH_NAME = "scholixResource"
  val DATACITE_OAF_PATH_NAME = "dataciteOAFUpdate"
  val PID_MAP_PATH_NAME = "pidMap"
  val RESOLVED_REL_PATH_NAME = "resolvedRelation"
  val SCHOLIX_PATH_NAME = "scholix"

  def scholixResourcePath(workingPath: String) = s"$workingPath/$SCHOLIX_RESOURCE_PATH_NAME"
  def dataciteOAFPath(workingPath: String) = s"$workingPath/$DATACITE_OAF_PATH_NAME"
  def pidMapPath(workingPath: String) = s"$workingPath/$PID_MAP_PATH_NAME"
  def resolvedRelationPath(workingPath: String) = s"$workingPath/$RESOLVED_REL_PATH_NAME"
  def scholixPath(workingPath: String) = s"$workingPath/$SCHOLIX_PATH_NAME"

  /** Utility to parse an ISO8601 date into epoch milliseconds
    * @param inputDate a date string in ISO8601 format
    * @return the epoch milliseconds of the parsed date
    */
  def ISO8601toEpochMillis(inputDate: String): Long = {
    simpleFormatter.parse(inputDate).getTime
  }
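  /* Illustrative behaviour (the 'Z' in ISO_DATE_PATTERN expects an RFC 822 zone
     offset such as +0000):
     ISO8601toEpochMillis("1970-01-01T00:00:00+0000") == 0L
   */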
  /** This method tries to retrieve the last collection date from all the Datacite
    * records in HDFS.
    * It should be called before indexing Scholexplorer, to compute the delta of
    * Datacite records to download, since the interval between the generation of the
    * raw graph and the generation of Scholexplorer can be as long as 20 days
    * @param spark the spark session
    * @param entitiesPath the path of the entities
    * @return the last collection date of the Datacite records in the current Scholexplorer graph
    */
  def retrieveLastCollectedFrom(spark: SparkSession, entitiesPath: String): Long = {
    log.info("Retrieve last entities collected From")

    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
    import spark.implicits._

    val entitiesDS = spark.read
      .load(s"$entitiesPath/*")
      .as[Oaf]
      .filter(o => o.isInstanceOf[Result])
      .map(r => r.asInstanceOf[Result])

    val date = entitiesDS
      .filter(r => r.getDateofcollection != null)
      .map(_.getDateofcollection)
      .select(max("value"))
      .first
      .getString(0)

    ISO8601toEpochMillis(date) / 1000
  }

  /** Updating the Datacite relationships on Scholexplorer
    * needs some utility data structures.
    * One of them is the scholixResource DS, which stores all the nodes of the
    * Scholix graph in ScholixResource format
    * @param summaryPath the path of the Scholix summaries
    * @param workingPath the working path
    * @param spark the spark session
    */
  def generateScholixResource(
    summaryPath: String,
    workingPath: String,
    spark: SparkSession
  ): Unit = {
    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]

    log.info("Convert All summary to ScholixResource")
    spark.read
      .load(summaryPath)
      .as[ScholixSummary]
      .map(ScholixUtils.generateScholixResourceFromSummary)(scholixResourceEncoder)
      .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"${scholixResourcePath(workingPath)}_native")
  }

  /** This method converts the new Datacite resources into Scholix resources,
    * needed to fill the source and the type of the Scholix relationships
    * @param workingPath the working path
    * @param spark the spark session
    */
  def addMissingScholixResource(workingPath: String, spark: SparkSession): Unit = {
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
    implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
    import spark.implicits._

    spark.read
      .load(dataciteOAFPath(workingPath))
      .as[Oaf]
      .filter(_.isInstanceOf[Result])
      .map(_.asInstanceOf[Result])
      .map(ScholixUtils.generateScholixResourceFromResult)
      .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"${scholixResourcePath(workingPath)}_update")

    val update = spark.read.load(s"${scholixResourcePath(workingPath)}_update").as[ScholixResource]
    val native = spark.read.load(s"${scholixResourcePath(workingPath)}_native").as[ScholixResource]
    val graph = update
      .union(native)
      .groupByKey(_.getDnetIdentifier)
      .reduceGroups((a, b) => if (a != null && a.getDnetIdentifier != null) a else b)
      .map(_._2)
    graph.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_graph")
  }
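  /* Editorial note on the merge above: groupByKey(_.getDnetIdentifier) followed by
     reduceGroups keeps the first record with a non-null dnet identifier for each
     group, so the merged _graph dataset contains exactly one ScholixResource per
     dnet identifier. */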
  /** This method retrieves and transforms only the Datacite records with a
    * timestamp greater than the given one
    * @param datacitePath the datacite input Path
    * @param timestamp the timestamp
    * @param workingPath the working path where the generated Dataset is saved
    * @param spark the spark session
    * @param vocabularies the vocabularies needed for the transformation
    * @return the number of Datacite records with a timestamp greater than the given one
    */
  def getDataciteUpdate(
    datacitePath: String,
    timestamp: Long,
    workingPath: String,
    spark: SparkSession,
    vocabularies: VocabularyGroup
  ): Long = {
    import spark.implicits._
    val ds = spark.read.load(datacitePath).as[DataciteType]
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    val total = ds.filter(_.timestamp >= timestamp).count()
    if (total > 0) {
      ds.filter(_.timestamp >= timestamp)
        .flatMap(d =>
          DataciteToOAFTransformation
            .generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true)
        )
        .flatMap(i => fixRelations(i))
        .filter(i => i != null)
        .write
        .mode(SaveMode.Overwrite)
        .save(dataciteOAFPath(workingPath))
    }
    total
  }

  /** After adding the new ScholixResources, we need to update the Scholix PID map
    * so it can be intersected with the new Datacite relations
    *
    * @param workingPath the working path where the new map is saved
    * @param spark the spark session
    */
  def generatePidMap(workingPath: String, spark: SparkSession): Unit = {
    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
    import spark.implicits._
    spark.read
      .load(s"${scholixResourcePath(workingPath)}_graph")
      .as[ScholixResource]
      .flatMap(r =>
        r.getIdentifier.asScala
          .map(i => DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema))
          .map(t => (t, r.getDnetIdentifier))
      )(Encoders.tuple(Encoders.STRING, Encoders.STRING))
      .groupByKey(_._1)
      .reduceGroups((a, b) => if (a != null && a._2 != null) a else b)
      .map(_._2)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
      .write
      .mode(SaveMode.Overwrite)
      .save(pidMapPath(workingPath))
  }

  /** This method resolves the Datacite relations and keeps only the
    * resolved ones
    * @param workingPath the working path
    * @param spark the spark session
    */
  def resolveUpdateRelation(workingPath: String, spark: SparkSession): Unit = {
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    import spark.implicits._

    val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String, String)]

    val unresolvedRelations: Dataset[(String, Relation)] = spark.read
      .load(dataciteOAFPath(workingPath))
      .as[Oaf]
      .filter(_.isInstanceOf[Relation])
      .map(_.asInstanceOf[Relation])
      .map { r =>
        if (r.getSource.startsWith("unresolved"))
          (r.getSource, r)
        else
          (r.getTarget, r)
      }(Encoders.tuple(Encoders.STRING, relationEncoder))

    unresolvedRelations
      .joinWith(pidMap, unresolvedRelations("_1").equalTo(pidMap("_1")))
      .map(t => {
        val r = t._1._2
        val resolvedIdentifier = t._2._2
        if (r.getSource.startsWith("unresolved"))
          r.setSource(resolvedIdentifier)
        else
          r.setTarget(resolvedIdentifier)
        r
      })(relationEncoder)
      .filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved")))
      .write
      .mode(SaveMode.Overwrite)
      .save(resolvedRelationPath(workingPath))
  }

  /** This method generates the Scholix records starting from the resolved relations
    *
    * @param workingPath the working path
    * @param spark the spark session
    */
  def generateScholixUpdate(workingPath: String, spark: SparkSession): Unit = {
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
    implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val intermediateEncoder: Encoder[(String, Scholix)] =
      Encoders.tuple(Encoders.STRING, scholixEncoder)

    val relations: Dataset[(String, Relation)] = spark.read
      .load(resolvedRelationPath(workingPath))
      .as[Relation]
      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relationEncoder))

    val id_summary: Dataset[(String, ScholixResource)] = spark.read
      .load(s"${scholixResourcePath(workingPath)}_graph")
      .as[ScholixResource]
      .map(r => (r.getDnetIdentifier, r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder))

    id_summary.cache()

    relations
      .joinWith(id_summary, relations("_1").equalTo(id_summary("_1")), "inner")
      .map(t => (t._1._2.getTarget, ScholixUtils.scholixFromSource(t._1._2, t._2._2)))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/scholix_one_verse")

    val source_scholix: Dataset[(String, Scholix)] =
      spark.read.load(s"$workingPath/scholix_one_verse").as[(String, Scholix)]

    source_scholix
      .joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")), "inner")
      .map(t => {
        val target: ScholixResource = t._2._2
        val scholix: Scholix = t._1._2
        ScholixUtils.generateCompleteScholix(scholix, target)
      })(scholixEncoder)
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/scholix")
  }

  /** All the Spark applications run this method,
    * where the whole logic of the Spark node is defined
    */
  override def run(): Unit = {
    val sourcePath = parser.get("sourcePath")
    log.info(s"SourcePath is '$sourcePath'")

    val datacitePath = parser.get("datacitePath")
    log.info(s"DatacitePath is '$datacitePath'")

    val workingPath = parser.get("workingSupportPath")
    log.info(s"workingPath is '$workingPath'")

    val isLookupUrl: String = parser.get("isLookupUrl")
    log.info("isLookupUrl: {}", isLookupUrl)

    val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
    val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
    require(vocabularies != null)

    val updateDS: Boolean = "true".equalsIgnoreCase(parser.get("updateDS"))
    log.info(s"updateDS is '$updateDS'")

    var lastCollectionDate = 0L
    if (updateDS) {
      generateScholixResource(s"$sourcePath/provision/summaries", workingPath, spark)
      log.info("Retrieve last entities collected From starting from scholix Graph")
      lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities")
    } else {
      val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
      fs.delete(new Path(s"${scholixResourcePath(workingPath)}_native"), true)
      fs.rename(
        new Path(s"${scholixResourcePath(workingPath)}_graph"),
        new Path(s"${scholixResourcePath(workingPath)}_native")
      )
      lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath))
    }

    val numRecords =
      getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
    if (numRecords > 0) {
      addMissingScholixResource(workingPath, spark)
      generatePidMap(workingPath, spark)
      resolveUpdateRelation(workingPath, spark)
      generateScholixUpdate(workingPath, spark)
    }
  }
}

object SparkRetrieveDataciteDelta {
  val log: Logger = LoggerFactory.getLogger(SparkRetrieveDataciteDelta.getClass)

  def main(args: Array[String]): Unit = {
    new SparkRetrieveDataciteDelta(
      "/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json",
      args,
      log
    ).initialize().run()
  }
}
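/* Hypothetical invocation sketch (not part of the original source): the argument names
   come from retrieve_datacite_delta_params.json above; the jar name, paths and service
   URL are placeholders only.
   spark-submit --class eu.dnetlib.dhp.sx.graph.SparkRetrieveDataciteDelta dhp-aggregation.jar \
     --sourcePath /scholix/graph --datacitePath /datacite/native \
     --workingSupportPath /scholix/support --isLookupUrl http://services.example/isLookUp \
     --updateDS false --master yarn
 */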
@ -0,0 +1,138 @@
package eu.dnetlib.dhp.actionmanager.opencitations;

import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
import eu.dnetlib.dhp.schema.oaf.Dataset;

public class ReadCOCITest {

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private static SparkSession spark;

    private static Path workingDir;
    private static final Logger log = LoggerFactory.getLogger(ReadCOCITest.class);

    @BeforeAll
    public static void beforeAll() throws IOException {
        workingDir = Files.createTempDirectory(ReadCOCITest.class.getSimpleName());
        log.info("using work dir {}", workingDir);

        SparkConf conf = new SparkConf();
        conf.setAppName(ReadCOCITest.class.getSimpleName());

        conf.setMaster("local[*]");
        conf.set("spark.driver.host", "localhost");
        conf.set("hive.metastore.local", "true");
        conf.set("spark.ui.enabled", "false");
        conf.set("spark.sql.warehouse.dir", workingDir.toString());
        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

        spark = SparkSession
            .builder()
            .appName(ReadCOCITest.class.getSimpleName())
            .config(conf)
            .getOrCreate();
    }

    @AfterAll
    public static void afterAll() throws IOException {
        FileUtils.deleteDirectory(workingDir.toFile());
        spark.stop();
    }

    @Test
    void testReadCOCI() throws Exception {
        String inputPath = getClass()
            .getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
            .getPath();

        LocalFileSystem fs = FileSystem.getLocal(new Configuration());
        for (int n = 1; n <= 5; n++) {
            fs
                .copyFromLocalFile(
                    false, new org.apache.hadoop.fs.Path(getClass()
                        .getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input" + n + ".gz")
                        .getPath()),
                    new org.apache.hadoop.fs.Path(workingDir + "/COCI/input" + n + ".gz"));
        }

        ReadCOCI
            .main(
                new String[] {
                    "-isSparkSessionManaged",
                    Boolean.FALSE.toString(),
                    "-workingPath",
                    workingDir.toString() + "/COCI",
                    "-outputPath",
                    workingDir.toString() + "/COCI_json/",
                    "-inputFile", "input1;input2;input3;input4;input5"
                });

        final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        JavaRDD<COCI> tmp = sc
            .textFile(workingDir.toString() + "/COCI_json/*/")
            .map(item -> OBJECT_MAPPER.readValue(item, COCI.class));

        Assertions.assertEquals(24, tmp.count());

        Assertions.assertEquals(1, tmp.filter(c -> c.getCiting().equals("10.1207/s15327647jcd3,4-01")).count());

        Assertions.assertEquals(8, tmp.filter(c -> c.getCiting().indexOf(".refs") > -1).count());
    }

}
@ -1,190 +0,0 @@
|
||||
package eu.dnetlib.dhp.sx.bio
|
||||
|
||||
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
|
||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf}
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.junit.jupiter.api.Assertions._
|
||||
import org.junit.jupiter.api.extension.ExtendWith
|
||||
import org.junit.jupiter.api.{BeforeEach, Test}
|
||||
import org.mockito.junit.jupiter.MockitoExtension
|
||||
|
||||
import java.io.{BufferedReader, InputStream, InputStreamReader}
|
||||
import java.util.zip.GZIPInputStream
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.Source
|
||||
import scala.xml.pull.XMLEventReader
|
||||
|
||||
@ExtendWith(Array(classOf[MockitoExtension]))
|
||||
class BioScholixTest extends AbstractVocabularyTest{
|
||||
|
||||
|
||||
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false)
|
||||
|
||||
@BeforeEach
|
||||
def setUp() :Unit = {
|
||||
|
||||
super.setUpVocabulary()
|
||||
}
|
||||
|
||||
class BufferedReaderIterator(reader: BufferedReader) extends Iterator[String] {
|
||||
override def hasNext() = reader.ready
|
||||
override def next() = reader.readLine()
|
||||
}
|
||||
|
||||
object GzFileIterator {
|
||||
def apply(is: InputStream, encoding: String) = {
|
||||
new BufferedReaderIterator(
|
||||
new BufferedReader(
|
||||
new InputStreamReader(
|
||||
new GZIPInputStream(
|
||||
is), encoding)))
|
||||
}
|
||||
}

  @Test
  def testEBIData(): Unit = {
    val inputXML = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml"))
      .mkString
    val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
    new PMParser(xml).foreach(s => println(mapper.writeValueAsString(s)))
  }

  @Test
  def testPubmedToOaf(): Unit = {
    assertNotNull(vocabularies)
    assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
    val records: String = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump"))
      .mkString
    val r: List[Oaf] = records.lines.toList
      .map(s => mapper.readValue(s, classOf[PMArticle]))
      .map(a => PubMedToOaf.convert(a, vocabularies))
    assertEquals(10, r.size)
    assertTrue(
      r.map(p => p.asInstanceOf[Result])
        .flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid))
        .exists(p => "0037".equalsIgnoreCase(p)))
    println(mapper.writeValueAsString(r.head))
  }

  @Test
  def testPDBToOAF(): Unit = {
    assertNotNull(vocabularies)
    assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
    val records: String = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump"))
      .mkString
    records.lines.foreach(s => assertTrue(s.nonEmpty))

    val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))

    assertTrue(result.nonEmpty)
    result.foreach(r => assertNotNull(r))

    println(result.count(o => o.isInstanceOf[Relation]))
    println(mapper.writeValueAsString(result.head))
  }

  @Test
  def testUNIprotToOAF(): Unit = {
    assertNotNull(vocabularies)
    assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))

    val records: String = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump"))
      .mkString
    records.lines.foreach(s => assertTrue(s.nonEmpty))

    val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))

    assertTrue(result.nonEmpty)
    result.foreach(r => assertNotNull(r))

    println(result.count(o => o.isInstanceOf[Relation]))
    println(mapper.writeValueAsString(result.head))
  }

  case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String)

  def parse_ebi_links(input: String): List[EBILinks] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)
    val pmid = (json \ "publication" \ "pmid").extract[String]
    for {
      JObject(link) <- json \\ "Link"
      JField("Target", JObject(target)) <- link
      JField("RelationshipType", JObject(relType)) <- link
      JField("Name", JString(relation)) <- relType
      JField("PublicationDate", JString(publicationDate)) <- link
      JField("Title", JString(title)) <- target
      JField("Identifier", JObject(identifier)) <- target
      JField("IDScheme", JString(idScheme)) <- identifier
      JField("ID", JString(id)) <- identifier
    } yield EBILinks(relation, publicationDate, title, pmid, id, idScheme)
  }
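
  // Editorial note: the for-comprehension above relies on json4s pattern matching in
  // generators; a binding like JField("Title", JString(title)) <- target both filters
  // and destructures, so Link entries missing any of the matched fields are skipped
  // silently rather than raising an error.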

  @Test
  def testCrossrefLinksToOAF(): Unit = {
    val records: String = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links"))
      .mkString
    records.lines.foreach(s => assertTrue(s.nonEmpty))

    val result: List[Oaf] = records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList

    assertNotNull(result)
    assertTrue(result.nonEmpty)

    println(mapper.writeValueAsString(result.head))
  }

  @Test
  def testEBILinksToOAF(): Unit = {
    val iterator = GzFileIterator(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"), "UTF-8")
    val data = iterator.next()

    val res = BioDBToOAF
      .parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links)
      .filter(BioDBToOAF.EBITargetLinksFilter)
      .flatMap(BioDBToOAF.convertEBILinksToOaf)
    println(res.length)

    println(mapper.writeValueAsString(res.head))
  }

  @Test
  def scholixResolvedToOAF(): Unit = {
    val records: String = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved"))
      .mkString
    records.lines.foreach(s => assertTrue(s.nonEmpty))

    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

    val l: List[ScholixResolved] = records.lines.map { input =>
      lazy val json = parse(input)
      json.extract[ScholixResolved]
    }.toList

    val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))

    assertTrue(result.nonEmpty)
  }

}
File diff suppressed because it is too large
@ -0,0 +1,4 @@
{"50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1": [{"id": "influence", "unit": [{"value": "6.63451994567e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.348694533145", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.16094680115e-09", "key": "score"}]}]}
{"50|dedup_wf_001::05b1f8ce98702f69d07aa5f0429de1e3": [{"id": "influence", "unit": [{"value": "6.25057357279e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "7.0208", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.40234462244e-08", "key": "score"}]}]}
{"50|dedup_wf_001::08823c8f5c3ca2eae523817036cdda67": [{"id": "influence", "unit": [{"value": "5.54921449123e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.53012887452e-10", "key": "score"}]}]}
{"50|dedup_wf_001::0e72b399325d6efcbe3271891a1dfe4c": [{"id": "influence", "unit": [{"value": "1.63466096315e-08", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "20.9870879741", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.49501495323e-08", "key": "score"}]}]}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,38 +1,39 @@
{"doi":"10.3390/s18072310","level1":"engineering and technology","level2":"nano-technology","level3":"nanoscience & nanotechnology"}
{"doi":"10.1111/1365-2656.12831\u000210.17863/cam.24369","level1":"social sciences","level2":"psychology and cognitive sciences","level3":"NULL"}
{"doi":"10.3929/ethz-b-000187584\u000210.1002/chem.201701644","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"10.1080/01913123.2017.1367361","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"}
{"doi":"10.1051/e3sconf/20199207011","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"environmental sciences"}
{"doi":"10.1038/onc.2015.333","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"}
{"doi":"10.1093/mnras/staa256","level1":"natural sciences","level2":"physical sciences","level3":"NULL"}
{"doi":"10.1016/j.jclepro.2018.07.166","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"}
{"doi":"10.1103/physrevlett.125.037403","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
{"doi":"10.1080/03602532.2017.1316285","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"10.1001/jamanetworkopen.2019.1868","level1":"medical and health sciences","level2":"other medical science","level3":"health policy & services"}
{"doi":"10.1128/mra.00874-18","level1":"natural sciences","level2":"biological sciences","level3":"plant biology & botany"}
{"doi":"10.1016/j.nancom.2018.03.001","level1":"engineering and technology","level2":"NULL","level3":"NULL"}
{"doi":"10.1112/topo.12174","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"10.12688/wellcomeopenres.15846.1","level1":"medical and health sciences","level2":"health sciences","level3":"NULL"}
{"doi":"10.21468/scipostphys.3.1.001","level1":"natural sciences","level2":"physical sciences","level3":"NULL"}
{"doi":"10.1088/1741-4326/ab6c77","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
{"doi":"10.1109/tpwrs.2019.2944747","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"}
{"doi":"10.1016/j.expthermflusci.2019.109994\u000210.17863/cam.46212","level1":"engineering and technology","level2":"mechanical engineering","level3":"mechanical engineering & transports"}
{"doi":"10.1109/tc.2018.2860012","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"computer hardware & architecture"}
{"doi":"10.1002/mma.6622","level1":"natural sciences","level2":"mathematics","level3":"numerical & computational mathematics"}
{"doi":"10.1051/radiopro/2020020","level1":"natural sciences","level2":"chemical sciences","level3":"NULL"}
{"doi":"10.1007/s12268-019-1003-4","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
{"doi":"10.3390/cancers12010236","level1":"medical and health sciences","level2":"health sciences","level3":"biochemistry & molecular biology"}
{"doi":"10.6084/m9.figshare.9912614\u000210.6084/m9.figshare.9912614.v1\u000210.1080/00268976.2019.1665199","level1":"natural sciences","level2":"chemical sciences","level3":"physical chemistry"}
{"doi":"10.1175/jpo-d-17-0239.1","level1":"natural sciences","level2":"biological sciences","level3":"marine biology & hydrobiology"}
{"doi":"10.1007/s13218-020-00674-7","level1":"engineering and technology","level2":"industrial biotechnology","level3":"industrial engineering & automation"}
{"doi":"10.1016/j.psyneuen.2016.02.003\u000210.1016/j.psyneuen.2016.02.00310.7892/boris.78886\u000210.7892/boris.78886","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
{"doi":"10.1109/ted.2018.2813542","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"}
{"doi":"10.3989/scimar.04739.25a","level1":"natural sciences","level2":"biological sciences","level3":"NULL"}
{"doi":"10.3390/su12187503","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"NULL"}
{"doi":"10.1016/j.ccell.2018.08.017","level1":"medical and health sciences","level2":"basic medicine","level3":"biochemistry & molecular biology"}
{"doi":"10.1103/physrevresearch.2.023322","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
{"doi":"10.1039/c8cp03234c","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"10.5281/zenodo.3696557\u000210.5281/zenodo.3696556\u000210.1109/jsac.2016.2545384","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"networking & telecommunications"}
{"doi":"10.1038/ng.3667\u000210.1038/ng.3667.\u000210.17615/tct6-4m26\u000210.17863/cam.15649","level1":"medical and health sciences","level2":"health sciences","level3":"genetics & heredity"}
{"doi":"10.1016/j.jclepro.2019.119065","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"}
{"doi":"10.1111/pce.13392","level1":"agricultural and veterinary sciences","level2":"agriculture, forestry, and fisheries","level3":"agronomy & agriculture"}
{"doi":"10.1080/1536383x.2020.1868997","level1":"02 engineering and technology","level2":"0210 nano-technology","level3":"021001 nanoscience & nanotechnology"}
{"doi":"10.1080/1536383x.2020.1868997","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010402 general chemistry"}
{"doi":"10.1186/s40425-019-0732-8","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030220 oncology & carcinogenesis"}
{"doi":"10.1186/s40425-019-0732-8","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030304 developmental biology"}
{"doi":"10.1007/s10482-021-01529-3","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030304 developmental biology"}
{"doi":"10.1007/s10482-021-01529-3","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030306 microbiology"}
{"doi":"10.1155/2021/6643273","level1":"01 natural sciences","level2":"0103 physical sciences","level3":"010301 acoustics"}
{"doi":"10.1155/2021/6643273","level1":"02 engineering and technology","level2":"0209 industrial biotechnology","level3":"020901 industrial engineering & automation"}
{"doi":"10.12737/article_5d6613dbf2ad51.82646096","level1":"02 engineering and technology","level2":"0210 nano-technology","level3":"021001 nanoscience & nanotechnology"}
{"doi":"10.12737/article_5d6613dbf2ad51.82646096","level1":"01 natural sciences","level2":"0103 physical sciences","level3":"010302 applied physics"}
{"doi":"10.1216/jie.2020.32.457","level1":"01 natural sciences","level2":"0101 mathematics","level3":"010101 applied mathematics"}
{"doi":"10.1216/jie.2020.32.457","level1":"01 natural sciences","level2":"0101 mathematics","level3":"010102 general mathematics"}
{"doi":"10.3934/naco.2021021","level1":"02 engineering and technology","level2":"0211 other engineering and technologies","level3":"021103 operations research"}
{"doi":"10.3934/naco.2021021","level1":"02 engineering and technology","level2":"0209 industrial biotechnology","level3":"020901 industrial engineering & automation"}
{"doi":"10.1080/1034912x.2021.1910933","level1":"05 social sciences","level2":"050301 education","level3":"050301 education"}
{"doi":"10.1080/1034912x.2021.1910933","level1":"05 social sciences","level2":"0501 psychology and cognitive sciences","level3":"050104 developmental & child psychology"}
{"doi":"10.1016/j.rtbm.2020.100596","level1":"05 social sciences","level2":"0502 economics and business","level3":"050211 marketing"}
{"doi":"10.1016/j.rtbm.2020.100596","level1":"05 social sciences","level2":"0502 economics and business","level3":"050212 sport, leisure & tourism"}
{"doi":"10.14807/ijmp.v11i8.1220","level1":"05 social sciences","level2":"0502 economics and business","level3":"050211 marketing"}
{"doi":"10.14807/ijmp.v11i8.1220","level1":"05 social sciences","level2":"0502 economics and business","level3":"050203 business & management"}
{"doi":"10.1007/s13205-020-02415-x","level1":"03 medical and health sciences","level2":"0303 health sciences","level3":"030304 developmental biology"}
{"doi":"10.1007/s13205-020-02415-x","level1":"03 medical and health sciences","level2":"0303 health sciences","level3":"030302 biochemistry & molecular biology"}
{"doi":"10.3390/s18072310","level1":"04 agricultural and veterinary sciences","level2":"0404 agricultural biotechnology","level3":"040502 food science"}
{"doi":"10.3390/s18072310","level1":"03 medical and health sciences","level2":"0303 health sciences","level3":"030309 nutrition & dietetics"}
{"doi":"10.1063/5.0032658","level1":"01 natural sciences","level2":"0103 physical sciences","level3":"010304 chemical physics"}
{"doi":"10.1063/5.0032658","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010402 general chemistry"}
{"doi":"10.1145/3411174.3411195","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020201 artificial intelligence & image processing"}
{"doi":"10.1145/3411174.3411195","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020206 networking & telecommunications"}
{"doi":"10.1021/acs.joc.0c02755","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010405 organic chemistry"}
{"doi":"10.1021/acs.joc.0c02755","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010402 general chemistry"}
{"doi":"10.1002/jcp.28608","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030220 oncology & carcinogenesis"}
{"doi":"10.1002/jcp.28608","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030304 developmental biology"}
{"doi":"10.1097/cmr.0000000000000579","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030220 oncology & carcinogenesis"}
{"doi":"10.1097/cmr.0000000000000579","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030304 developmental biology"}
{"doi":"10.1007/s11164-020-04383-6","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010405 organic chemistry"}
{"doi":"10.1007/s11164-020-04383-6","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010402 general chemistry"}
{"doi":"10.1016/j.actpsy.2020.103155","level1":"05 social sciences","level2":"0501 psychology and cognitive sciences","level3":"050105 experimental psychology"}
{"doi":"10.1016/j.actpsy.2020.103155","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030217 neurology & neurosurgery"}
{"doi":"10.1109/memea49120.2020.9137187","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020208 electrical & electronic engineering"}
@ -0,0 +1,39 @@
10.1080/1536383x.2020.1868997,02 engineering and technology,0210 nano-technology,021001 nanoscience & nanotechnology
10.1080/1536383x.2020.1868997,01 natural sciences,0104 chemical sciences,010402 general chemistry
10.1186/s40425-019-0732-8,03 medical and health sciences,0302 clinical medicine,030220 oncology & carcinogenesis
10.1186/s40425-019-0732-8,03 medical and health sciences,0301 basic medicine,030304 developmental biology
10.1007/s10482-021-01529-3,03 medical and health sciences,0301 basic medicine,030304 developmental biology
10.1007/s10482-021-01529-3,03 medical and health sciences,0301 basic medicine,030306 microbiology
10.1155/2021/6643273,01 natural sciences,0103 physical sciences,010301 acoustics
10.1155/2021/6643273,02 engineering and technology,0209 industrial biotechnology,020901 industrial engineering & automation
10.12737/article_5d6613dbf2ad51.82646096,02 engineering and technology,0210 nano-technology,021001 nanoscience & nanotechnology
10.12737/article_5d6613dbf2ad51.82646096,01 natural sciences,0103 physical sciences,010302 applied physics
10.1216/jie.2020.32.457,01 natural sciences,0101 mathematics,010101 applied mathematics
10.1216/jie.2020.32.457,01 natural sciences,0101 mathematics,010102 general mathematics
10.3934/naco.2021021,02 engineering and technology,0211 other engineering and technologies,021103 operations research
10.3934/naco.2021021,02 engineering and technology,0209 industrial biotechnology,020901 industrial engineering & automation
10.1080/1034912x.2021.1910933,05 social sciences,050301 education,050301 education
10.1080/1034912x.2021.1910933,05 social sciences,0501 psychology and cognitive sciences,050104 developmental & child psychology
10.1016/j.rtbm.2020.100596,05 social sciences,0502 economics and business,050211 marketing
10.1016/j.rtbm.2020.100596,05 social sciences,0502 economics and business,"050212 sport, leisure & tourism"
10.14807/ijmp.v11i8.1220,05 social sciences,0502 economics and business,050211 marketing
10.14807/ijmp.v11i8.1220,05 social sciences,0502 economics and business,050203 business & management
10.1007/s13205-020-02415-x,03 medical and health sciences,0303 health sciences,030304 developmental biology
10.1007/s13205-020-02415-x,03 medical and health sciences,0303 health sciences,030302 biochemistry & molecular biology
10.3390/foods10040865,04 agricultural and veterinary sciences,0404 agricultural biotechnology,040502 food science
10.3390/foods10040865,03 medical and health sciences,0303 health sciences,030309 nutrition & dietetics
10.1063/5.0032658,01 natural sciences,0103 physical sciences,010304 chemical physics
10.1063/5.0032658,01 natural sciences,0104 chemical sciences,010402 general chemistry
10.1145/3411174.3411195,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020201 artificial intelligence & image processing
10.1145/3411174.3411195,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020206 networking & telecommunications
10.1021/acs.joc.0c02755,01 natural sciences,0104 chemical sciences,010405 organic chemistry
10.1021/acs.joc.0c02755,01 natural sciences,0104 chemical sciences,010402 general chemistry
10.1002/jcp.28608,03 medical and health sciences,0302 clinical medicine,030220 oncology & carcinogenesis
10.1002/jcp.28608,03 medical and health sciences,0301 basic medicine,030304 developmental biology
10.1097/cmr.0000000000000579,03 medical and health sciences,0302 clinical medicine,030220 oncology & carcinogenesis
10.1097/cmr.0000000000000579,03 medical and health sciences,0301 basic medicine,030304 developmental biology
10.1007/s11164-020-04383-6,01 natural sciences,0104 chemical sciences,010405 organic chemistry
10.1007/s11164-020-04383-6,01 natural sciences,0104 chemical sciences,010402 general chemistry
10.1016/j.actpsy.2020.103155,05 social sciences,0501 psychology and cognitive sciences,050105 experimental psychology
10.1016/j.actpsy.2020.103155,03 medical and health sciences,0302 clinical medicine,030217 neurology & neurosurgery
10.1109/memea49120.2020.9137187,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020208 electrical & electronic engineering
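
This CSV mirrors the DOI/FOS classification of the preceding JSON block, one doi,level1,level2,level3 tuple per row, with quotes around fields that themselves contain commas. A hedged sketch of the row shape (FOSRecord and parseFosRow are illustrative names; the naive split below is only safe for rows without quoted commas, so a real parser should use a CSV library):

case class FOSRecord(doi: String, level1: String, level2: String, level3: String)

// Naive mapping; breaks on quoted fields such as
// "0202 electrical engineering, electronic engineering, information engineering".
def parseFosRow(row: String): FOSRecord = {
  val Array(doi, l1, l2, l3) = row.split(",", 4)
  FOSRecord(doi, l1, l2, l3)
}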
@ -0,0 +1,37 @@
{"doi":"10.1001/amaguidesnewsletters.2019.mayjun02","sbj":"10. No inequality"}
{"doi":"10.1001/amaguidesnewsletters.2019.novdec01","sbj":"10. No inequality"}
{"doi":"10.1001/amaguidesnewsletters.2019.sepoct02","sbj":"3. Good health"}
{"doi":"10.1001/amaguidesnewsletters.2019.sepoct02","sbj":"8. Economic growth"}
{"doi":"10.1001/amaguidesnewsletters.2020.janfeb01","sbj":"8. Economic growth"}
{"doi":"10.1001/amaguidesnewsletters.2020.janfeb02","sbj":"3. Good health"}
{"doi":"10.1001/amaguidesnewsletters.2020.janfeb02","sbj":"8. Economic growth"}
{"doi":"10.1001/amaguidesnewsletters.2020.julaug01","sbj":"3. Good health"}
{"doi":"10.1001/amaguidesnewsletters.2020.marapr01","sbj":"3. Good health"}
{"doi":"10.1001/amaguidesnewsletters.2020.mayjun01","sbj":"3. Good health"}
{"doi":"10.1001/amaguidesnewsletters.2020.mayjun02","sbj":"16. Peace & justice"}
{"doi":"10.1001/amaguidesnewsletters.2020.mayjun02","sbj":"10. No inequality"}
{"doi":"10.1001/amaguidesnewsletters.2021.julaug01","sbj":"1. No poverty"}
{"doi":"10.1001/amaguidesnewsletters.2021.mayjune01","sbj":"10. No inequality"}
{"doi":"10.1001/amaguidesnewsletters.2021.mayjune02","sbj":"10. No inequality"}
{"doi":"10.4336/2021.pfb.41e201902078","sbj":"15. Life on land"}
{"doi":"10.4337/ejeep.2019.00045","sbj":"16. Peace & justice"}
{"doi":"10.4337/ejeep.2019.00050","sbj":"1. No poverty"}
{"doi":"10.4337/ejeep.2019.0045","sbj":"16. Peace & justice"}
{"doi":"10.4337/ejeep.2019.0050","sbj":"1. No poverty"}
{"doi":"10.4337/ejeep.2019.0051","sbj":"16. Peace & justice"}
{"doi":"10.4337/ejeep.2019.0052","sbj":"16. Peace & justice"}
{"doi":"10.4337/ejeep.2020.0058","sbj":"1. No poverty"}
{"doi":"10.4337/ejeep.2020.0058","sbj":"10. No inequality"}
{"doi":"10.4337/ejeep.2020.0060","sbj":"10. No inequality"}
{"doi":"10.4337/ejeep.2020.0065","sbj":"16. Peace & justice"}
{"doi":"10.4337/ejeep.2020.02.03","sbj":"16. Peace & justice"}
{"doi":"10.4337/ejeep.2020.02.05","sbj":"8. Economic growth"}
{"doi":"10.4337/ejeep.2020.02.06","sbj":"16. Peace & justice"}
{"doi":"10.4337/ejeep.2020.02.09","sbj":"16. Peace & justice"}
{"doi":"10.4337/roke.2020.01.01","sbj":"16. Peace & justice"}
{"doi":"10.4337/roke.2020.01.03","sbj":"16. Peace & justice"}
{"doi":"10.4337/roke.2020.01.05","sbj":"1. No poverty"}
{"doi":"10.4337/roke.2020.01.05","sbj":"8. Economic growth"}
{"doi":"10.4337/roke.2020.01.07","sbj":"8. Economic growth"}
{"doi":"10.4337/roke.2020.02.03","sbj":"8. Economic growth"}
{"doi":"10.3390/s18072310","sbj":"1. No poverty"}
@ -0,0 +1,37 @@
10.1001/amaguidesnewsletters.2019.mayjun02,10. No inequality
10.1001/amaguidesnewsletters.2019.novdec01,10. No inequality
10.1001/amaguidesnewsletters.2019.sepoct02,3. Good health
10.1001/amaguidesnewsletters.2019.sepoct02,8. Economic growth
10.1001/amaguidesnewsletters.2020.janfeb01,8. Economic growth
10.1001/amaguidesnewsletters.2020.janfeb02,3. Good health
10.1001/amaguidesnewsletters.2020.janfeb02,8. Economic growth
10.1001/amaguidesnewsletters.2020.julaug01,3. Good health
10.1001/amaguidesnewsletters.2020.marapr01,3. Good health
10.1001/amaguidesnewsletters.2020.mayjun01,3. Good health
10.1001/amaguidesnewsletters.2020.mayjun02,16. Peace & justice
10.1001/amaguidesnewsletters.2020.mayjun02,10. No inequality
10.1001/amaguidesnewsletters.2021.julaug01,1. No poverty
10.1001/amaguidesnewsletters.2021.mayjune01,10. No inequality
10.1001/amaguidesnewsletters.2021.mayjune02,10. No inequality
10.4336/2021.pfb.41e201902078,15. Life on land
10.4337/ejeep.2019.00045,16. Peace & justice
10.4337/ejeep.2019.00050,1. No poverty
10.4337/ejeep.2019.0045,16. Peace & justice
10.4337/ejeep.2019.0050,1. No poverty
10.4337/ejeep.2019.0051,16. Peace & justice
10.4337/ejeep.2019.0052,16. Peace & justice
10.4337/ejeep.2020.0058,1. No poverty
10.4337/ejeep.2020.0058,10. No inequality
10.4337/ejeep.2020.0060,10. No inequality
10.4337/ejeep.2020.0065,16. Peace & justice
10.4337/ejeep.2020.02.03,16. Peace & justice
10.4337/ejeep.2020.02.05,8. Economic growth
10.4337/ejeep.2020.02.06,16. Peace & justice
10.4337/ejeep.2020.02.09,16. Peace & justice
10.4337/roke.2020.01.01,16. Peace & justice
10.4337/roke.2020.01.03,16. Peace & justice
10.4337/roke.2020.01.05,1. No poverty
10.4337/roke.2020.01.05,8. Economic growth
10.4337/roke.2020.01.07,8. Economic growth
10.4337/roke.2020.02.03,8. Economic growth
10.4337/roke.2020.02.04,1. No poverty
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff