forked from D-Net/dnet-hadoop
Merge branch 'beta' into hierarchical_orgs_relations
This commit is contained in:
commit
89c7313fc5
|
@ -3,8 +3,6 @@
|
|||
*.iws
|
||||
*.ipr
|
||||
*.iml
|
||||
*.ipr
|
||||
*.iws
|
||||
*~
|
||||
.vscode
|
||||
.metals
|
||||
|
@ -27,4 +25,4 @@ spark-warehouse
|
|||
/**/job-override.properties
|
||||
/**/*.log
|
||||
/**/.factorypath
|
||||
|
||||
/**/.scalafmt.conf
|
||||
|
|
|
@ -22,9 +22,20 @@
|
|||
<id>dnet45-releases</id>
|
||||
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
|
||||
</repository>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>${dhp.site.stage.path}/dhp-build/dhp-code-style</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
<build>
|
||||
<extensions>
|
||||
<extension>
|
||||
<groupId>org.apache.maven.wagon</groupId>
|
||||
<artifactId>wagon-ssh</artifactId>
|
||||
<version>2.10</version>
|
||||
</extension>
|
||||
</extensions>
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
<plugin>
|
||||
|
@ -35,7 +46,7 @@
|
|||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-site-plugin</artifactId>
|
||||
<version>3.7.1</version>
|
||||
<version>3.9.1</version>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
|
@ -43,6 +54,7 @@
|
|||
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
|
||||
</properties>
|
||||
|
||||
</project>
|
|
@ -0,0 +1,21 @@
|
|||
style = defaultWithAlign
|
||||
|
||||
align.openParenCallSite = false
|
||||
align.openParenDefnSite = false
|
||||
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
|
||||
continuationIndent.callSite = 2
|
||||
continuationIndent.defnSite = 2
|
||||
danglingParentheses = true
|
||||
indentOperator = spray
|
||||
maxColumn = 120
|
||||
newlines.alwaysBeforeTopLevelStatements = true
|
||||
project.excludeFilters = [".*\\.sbt"]
|
||||
rewrite.rules = [AvoidInfix]
|
||||
rewrite.rules = [ExpandImportSelectors]
|
||||
rewrite.rules = [RedundantBraces]
|
||||
rewrite.rules = [RedundantParens]
|
||||
rewrite.rules = [SortImports]
|
||||
rewrite.rules = [SortModifiers]
|
||||
rewrite.rules = [PreferCurlyFors]
|
||||
spaces.inImportCurlyBraces = false
|
||||
unindentTopLevelOperators = true
|
|
@ -0,0 +1,21 @@
|
|||
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||
name="DHP-Aggregation">
|
||||
<skin>
|
||||
<groupId>org.apache.maven.skins</groupId>
|
||||
<artifactId>maven-fluido-skin</artifactId>
|
||||
<version>1.8</version>
|
||||
</skin>
|
||||
<poweredBy>
|
||||
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||
</poweredBy>
|
||||
<body>
|
||||
<links>
|
||||
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||
</links>
|
||||
<menu ref="modules" />
|
||||
<menu ref="reports"/>
|
||||
</body>
|
||||
</project>
|
|
@ -10,6 +10,9 @@
|
|||
<packaging>pom</packaging>
|
||||
|
||||
<description>This module is a container for the build tools used in dnet-hadoop</description>
|
||||
<properties>
|
||||
<maven.javadoc.skip>true</maven.javadoc.skip>
|
||||
</properties>
|
||||
|
||||
<modules>
|
||||
<module>dhp-code-style</module>
|
||||
|
@ -17,4 +20,12 @@
|
|||
<module>dhp-build-properties-maven-plugin</module>
|
||||
</modules>
|
||||
|
||||
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>${dhp.site.stage.path}/dhp-build/</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
|
||||
name="DHP-Aggregation">
|
||||
<skin>
|
||||
<groupId>org.apache.maven.skins</groupId>
|
||||
<artifactId>maven-fluido-skin</artifactId>
|
||||
<version>1.8</version>
|
||||
</skin>
|
||||
<poweredBy>
|
||||
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
|
||||
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
|
||||
</poweredBy>
|
||||
<body>
|
||||
<links>
|
||||
<item name="Code" href="https://code-repo.d4science.org/" />
|
||||
</links>
|
||||
|
||||
<menu ref="modules" />
|
||||
<menu ref="reports"/>
|
||||
</body>
|
||||
</project>
|
|
@ -13,7 +13,51 @@
|
|||
<artifactId>dhp-common</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<distributionManagement>
|
||||
<site>
|
||||
<id>DHPSite</id>
|
||||
<url>${dhp.site.stage.path}/dhp-common</url>
|
||||
</site>
|
||||
</distributionManagement>
|
||||
|
||||
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>${net.alchim31.maven.version}</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-test-compile</id>
|
||||
<phase>process-test-resources</phase>
|
||||
<goals>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-doc</id>
|
||||
<phase>process-resources</phase> <!-- or wherever -->
|
||||
<goals>
|
||||
<goal>doc</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
|
||||
|
|
|
@ -1,413 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class GraphResultMapper implements Serializable {
|
||||
|
||||
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
|
||||
E in) {
|
||||
|
||||
CommunityResult out = new CommunityResult();
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
|
||||
if (ort.isPresent()) {
|
||||
switch (ort.get().getClassid()) {
|
||||
case "publication":
|
||||
Optional<Journal> journal = Optional
|
||||
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
|
||||
if (journal.isPresent()) {
|
||||
Journal j = journal.get();
|
||||
Container c = new Container();
|
||||
c.setConferencedate(j.getConferencedate());
|
||||
c.setConferenceplace(j.getConferenceplace());
|
||||
c.setEdition(j.getEdition());
|
||||
c.setEp(j.getEp());
|
||||
c.setIss(j.getIss());
|
||||
c.setIssnLinking(j.getIssnLinking());
|
||||
c.setIssnOnline(j.getIssnOnline());
|
||||
c.setIssnPrinted(j.getIssnPrinted());
|
||||
c.setName(j.getName());
|
||||
c.setSp(j.getSp());
|
||||
c.setVol(j.getVol());
|
||||
out.setContainer(c);
|
||||
out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
|
||||
}
|
||||
break;
|
||||
case "dataset":
|
||||
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
|
||||
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
|
||||
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
|
||||
|
||||
out
|
||||
.setGeolocation(
|
||||
Optional
|
||||
.ofNullable(id.getGeolocation())
|
||||
.map(
|
||||
igl -> igl
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(gli -> {
|
||||
GeoLocation gl = new GeoLocation();
|
||||
gl.setBox(gli.getBox());
|
||||
gl.setPlace(gli.getPlace());
|
||||
gl.setPoint(gli.getPoint());
|
||||
return gl;
|
||||
})
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "software":
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
|
||||
Optional
|
||||
.ofNullable(is.getCodeRepositoryUrl())
|
||||
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(is.getDocumentationUrl())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setDocumentationUrl(
|
||||
value
|
||||
.stream()
|
||||
.map(Field::getValue)
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(is.getProgrammingLanguage())
|
||||
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
|
||||
|
||||
out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "other":
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
|
||||
out
|
||||
.setContactgroup(
|
||||
Optional
|
||||
.ofNullable(ir.getContactgroup())
|
||||
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out
|
||||
.setContactperson(
|
||||
Optional
|
||||
.ofNullable(ir.getContactperson())
|
||||
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
out
|
||||
.setTool(
|
||||
Optional
|
||||
.ofNullable(ir.getTool())
|
||||
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
Optional
|
||||
.ofNullable(input.getAuthor())
|
||||
.ifPresent(
|
||||
ats -> out.setAuthor(ats.stream().map(GraphResultMapper::getAuthor).collect(Collectors.toList())));
|
||||
|
||||
// I do not map Access Right UNKNOWN or OTHER
|
||||
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
|
||||
if (oar.isPresent()) {
|
||||
if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
|
||||
String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
|
||||
out
|
||||
.setBestaccessright(
|
||||
AccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.coarCodeLabelMap.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
}
|
||||
}
|
||||
|
||||
final List<String> contributorList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getContributor())
|
||||
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
|
||||
out.setContributor(contributorList);
|
||||
|
||||
Optional
|
||||
.ofNullable(input.getCountry())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setCountry(
|
||||
value
|
||||
.stream()
|
||||
.map(
|
||||
c -> {
|
||||
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
|
||||
return null;
|
||||
}
|
||||
Country country = new Country();
|
||||
country.setCode(c.getClassid());
|
||||
country.setLabel(c.getClassname());
|
||||
Optional
|
||||
.ofNullable(c.getDataInfo())
|
||||
.ifPresent(
|
||||
provenance -> country
|
||||
.setProvenance(
|
||||
Provenance
|
||||
.newInstance(
|
||||
provenance
|
||||
.getProvenanceaction()
|
||||
.getClassname(),
|
||||
c.getDataInfo().getTrust())));
|
||||
return country;
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
final List<String> coverageList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getCoverage())
|
||||
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
|
||||
out.setCoverage(coverageList);
|
||||
|
||||
out.setDateofcollection(input.getDateofcollection());
|
||||
|
||||
final List<String> descriptionList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getDescription())
|
||||
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
|
||||
out.setDescription(descriptionList);
|
||||
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
|
||||
if (oStr.isPresent()) {
|
||||
out.setEmbargoenddate(oStr.get().getValue());
|
||||
}
|
||||
|
||||
final List<String> formatList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getFormat())
|
||||
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
|
||||
out.setFormat(formatList);
|
||||
out.setId(input.getId());
|
||||
out.setOriginalId(input.getOriginalId());
|
||||
|
||||
Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
|
||||
.ofNullable(input.getInstance());
|
||||
|
||||
if (oInst.isPresent()) {
|
||||
out
|
||||
.setInstance(
|
||||
oInst.get().stream().map(GraphResultMapper::getInstance).collect(Collectors.toList()));
|
||||
|
||||
}
|
||||
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
|
||||
if (oL.isPresent()) {
|
||||
eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
|
||||
out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
|
||||
}
|
||||
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
|
||||
if (oLong.isPresent()) {
|
||||
out.setLastupdatetimestamp(oLong.get());
|
||||
}
|
||||
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
|
||||
if (otitle.isPresent()) {
|
||||
List<StructuredProperty> iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
|
||||
.collect(Collectors.toList());
|
||||
if (!iTitle.isEmpty()) {
|
||||
out.setMaintitle(iTitle.get(0).getValue());
|
||||
}
|
||||
|
||||
iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
|
||||
.collect(Collectors.toList());
|
||||
if (!iTitle.isEmpty()) {
|
||||
out.setSubtitle(iTitle.get(0).getValue());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
List<ControlledField> pids = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getPid())
|
||||
.ifPresent(
|
||||
value -> value
|
||||
.stream()
|
||||
.forEach(
|
||||
p -> pids
|
||||
.add(
|
||||
ControlledField
|
||||
.newInstance(p.getQualifier().getClassid(), p.getValue()))));
|
||||
out.setPid(pids);
|
||||
oStr = Optional.ofNullable(input.getDateofacceptance());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublicationdate(oStr.get().getValue());
|
||||
}
|
||||
oStr = Optional.ofNullable(input.getPublisher());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublisher(oStr.get().getValue());
|
||||
}
|
||||
|
||||
List<String> sourceList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getSource())
|
||||
.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
|
||||
// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
|
||||
List<Subject> subjectList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getSubject())
|
||||
.ifPresent(
|
||||
value -> value
|
||||
.forEach(s -> subjectList.add(getSubject(s))));
|
||||
|
||||
out.setSubjects(subjectList);
|
||||
|
||||
out.setType(input.getResulttype().getClassid());
|
||||
}
|
||||
|
||||
out
|
||||
.setCollectedfrom(
|
||||
input
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
return out;
|
||||
|
||||
}
|
||||
|
||||
private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
|
||||
CommunityInstance instance = new CommunityInstance();
|
||||
|
||||
setCommonValue(i, instance);
|
||||
|
||||
instance
|
||||
.setCollectedfrom(
|
||||
KeyValue
|
||||
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
|
||||
|
||||
instance
|
||||
.setHostedby(
|
||||
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
|
||||
|
||||
return instance;
|
||||
|
||||
}
|
||||
|
||||
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
|
||||
.ofNullable(i.getAccessright());
|
||||
if (opAr.isPresent()) {
|
||||
if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
|
||||
String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
|
||||
instance
|
||||
.setAccessright(
|
||||
AccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.coarCodeLabelMap.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
}
|
||||
}
|
||||
|
||||
Optional
|
||||
.ofNullable(i.getLicense())
|
||||
.ifPresent(value -> instance.setLicense(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getDateofacceptance())
|
||||
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getRefereed())
|
||||
.ifPresent(value -> instance.setRefereed(value.getClassname()));
|
||||
Optional
|
||||
.ofNullable(i.getInstancetype())
|
||||
.ifPresent(value -> instance.setType(value.getClassname()));
|
||||
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
|
||||
|
||||
}
|
||||
|
||||
private static Subject getSubject(StructuredProperty s) {
|
||||
Subject subject = new Subject();
|
||||
subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
|
||||
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
Provenance p = new Provenance();
|
||||
p.setProvenance(di.get().getProvenanceaction().getClassname());
|
||||
p.setTrust(di.get().getTrust());
|
||||
subject.setProvenance(p);
|
||||
}
|
||||
|
||||
return subject;
|
||||
}
|
||||
|
||||
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
|
||||
Author a = new Author();
|
||||
a.setFullname(oa.getFullname());
|
||||
a.setName(oa.getName());
|
||||
a.setSurname(oa.getSurname());
|
||||
a.setRank(oa.getRank());
|
||||
|
||||
Optional<List<StructuredProperty>> oPids = Optional
|
||||
.ofNullable(oa.getPid());
|
||||
if (oPids.isPresent()) {
|
||||
Pid pid = getOrcid(oPids.get());
|
||||
if (pid != null) {
|
||||
a.setPid(pid);
|
||||
}
|
||||
}
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
private static Pid getOrcid(List<StructuredProperty> p) {
|
||||
for (StructuredProperty pid : p) {
|
||||
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
return Pid
|
||||
.newInstance(
|
||||
ControlledField
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue()),
|
||||
Provenance
|
||||
.newInstance(
|
||||
di.get().getProvenanceaction().getClassname(),
|
||||
di.get().getTrust()));
|
||||
} else {
|
||||
return Pid
|
||||
.newInstance(
|
||||
ControlledField
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue())
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
|
@ -57,9 +57,17 @@ public class VocabularyGroup implements Serializable {
|
|||
final String syn = arr[2].trim();
|
||||
|
||||
vocs.addSynonyms(vocId, termId, syn);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// add the term names as synonyms
|
||||
vocs.vocs.values().forEach(voc -> {
|
||||
voc.getTerms().values().forEach(term -> {
|
||||
voc.addSynonym(term.getName().toLowerCase(), term.getId());
|
||||
});
|
||||
});
|
||||
|
||||
return vocs;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
package eu.dnetlib.dhp.oa.merge;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
|
@ -38,7 +38,7 @@ public class DispatchEntitiesSparkJob {
|
|||
.requireNonNull(
|
||||
DispatchEntitiesSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/dispatch_entities_parameters.json")));
|
||||
"/eu/dnetlib/dhp/oa/merge/dispatch_entities_parameters.json")));
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
package eu.dnetlib.dhp.oa.merge;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
|
||||
|
@ -53,7 +53,7 @@ public class GroupEntitiesSparkJob {
|
|||
.toString(
|
||||
GroupEntitiesSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/group_graph_entities_parameters.json"));
|
||||
"/eu/dnetlib/dhp/oa/merge/group_graph_entities_parameters.json"));
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
|
@ -16,6 +16,8 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
@ -86,6 +88,22 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
}
|
||||
|
||||
public static <T extends Oaf> boolean filter(T value) {
|
||||
if (Boolean.TRUE
|
||||
.equals(
|
||||
Optional
|
||||
.ofNullable(value)
|
||||
.map(
|
||||
o -> Optional
|
||||
.ofNullable(o.getDataInfo())
|
||||
.map(
|
||||
d -> Optional
|
||||
.ofNullable(d.getInvisible())
|
||||
.orElse(true))
|
||||
.orElse(true))
|
||||
.orElse(true))) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to evaluate here
|
||||
} else if (value instanceof Project) {
|
||||
|
@ -115,7 +133,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
return true;
|
||||
}
|
||||
|
||||
public static <T extends Oaf> T cleanup(T value) {
|
||||
public static <T extends Oaf> T cleanup(T value, VocabularyGroup vocs) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
|
@ -212,6 +230,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getFormat())) {
|
||||
r
|
||||
.setFormat(
|
||||
r
|
||||
.getFormat()
|
||||
.stream()
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getDescription())) {
|
||||
r
|
||||
.setDescription(
|
||||
|
@ -234,6 +261,38 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
if (Objects.nonNull(r.getInstance())) {
|
||||
|
||||
for (Instance i : r.getInstance()) {
|
||||
if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) {
|
||||
if (r instanceof Publication) {
|
||||
i
|
||||
.setInstancetype(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||
} else if (r instanceof Dataset) {
|
||||
i
|
||||
.setInstancetype(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||
} else if (r instanceof Software) {
|
||||
i
|
||||
.setInstancetype(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||
} else if (r instanceof OtherResearchProduct) {
|
||||
i
|
||||
.setInstancetype(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||
}
|
||||
}
|
||||
|
||||
if (Objects.nonNull(i.getPid())) {
|
||||
i.setPid(processPidCleaning(i.getPid()));
|
||||
}
|
||||
|
|
|
@ -47,6 +47,17 @@ public class OafMapperUtils {
|
|||
}
|
||||
|
||||
public static Result mergeResults(Result left, Result right) {
|
||||
|
||||
final boolean leftFromDelegatedAuthority = isFromDelegatedAuthority(left);
|
||||
final boolean rightFromDelegatedAuthority = isFromDelegatedAuthority(right);
|
||||
|
||||
if (leftFromDelegatedAuthority && !rightFromDelegatedAuthority) {
|
||||
return left;
|
||||
}
|
||||
if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
|
||||
return right;
|
||||
}
|
||||
|
||||
if (new ResultTypeComparator().compare(left, right) < 0) {
|
||||
left.mergeFrom(right);
|
||||
return left;
|
||||
|
@ -56,6 +67,18 @@ public class OafMapperUtils {
|
|||
}
|
||||
}
|
||||
|
||||
private static boolean isFromDelegatedAuthority(Result r) {
|
||||
return Optional
|
||||
.ofNullable(r.getInstance())
|
||||
.map(
|
||||
instance -> instance
|
||||
.stream()
|
||||
.filter(i -> Objects.nonNull(i.getCollectedfrom()))
|
||||
.map(i -> i.getCollectedfrom().getKey())
|
||||
.anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId)))
|
||||
.orElse(false);
|
||||
}
|
||||
|
||||
public static KeyValue keyValue(final String k, final String v) {
|
||||
final KeyValue kv = new KeyValue();
|
||||
kv.setKey(k);
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
package eu.dnetlib.dhp.application
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
/** This is the main Interface SparkApplication
|
||||
* where all the Spark Scala class should inherit
|
||||
*/
|
||||
trait SparkScalaApplication {
|
||||
|
||||
/** This is the path in the classpath of the json
|
||||
* describes all the argument needed to run
|
||||
*/
|
||||
val propertyPath: String
|
||||
|
||||
/** Utility to parse the arguments using the
|
||||
* property json in the classpath identified from
|
||||
* the variable propertyPath
|
||||
*
|
||||
* @param args the list of arguments
|
||||
*/
|
||||
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
parser
|
||||
}
|
||||
|
||||
/** Here all the spark applications runs this method
|
||||
* where the whole logic of the spark node is defined
|
||||
*/
|
||||
def run(): Unit
|
||||
}
|
||||
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.slf4j.Logger
|
||||
|
||||
abstract class AbstractScalaApplication(
|
||||
val propertyPath: String,
|
||||
val args: Array[String],
|
||||
log: Logger
|
||||
) extends SparkScalaApplication {
|
||||
|
||||
var parser: ArgumentApplicationParser = null
|
||||
|
||||
var spark: SparkSession = null
|
||||
|
||||
def initialize(): SparkScalaApplication = {
|
||||
parser = parseArguments(args)
|
||||
spark = createSparkSession()
|
||||
this
|
||||
}
|
||||
|
||||
/** Utility for creating a spark session starting from parser
|
||||
*
|
||||
* @return a spark Session
|
||||
*/
|
||||
private def createSparkSession(): SparkSession = {
|
||||
require(parser != null)
|
||||
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val master = parser.get("master")
|
||||
log.info(s"Creating Spark session: Master: $master")
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,442 @@
|
|||
package eu.dnetlib.dhp.sx.graph.scholix
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
|
||||
import eu.dnetlib.dhp.schema.sx.scholix._
|
||||
import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.{Encoder, Encoders}
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.Source
|
||||
|
||||
object ScholixUtils extends Serializable {
|
||||
|
||||
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
|
||||
|
||||
val DATE_RELATION_KEY: String = "RelationDate"
|
||||
|
||||
case class RelationVocabulary(original: String, inverse: String) {}
|
||||
|
||||
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
|
||||
|
||||
val relations: Map[String, RelationVocabulary] = {
|
||||
val input = Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
|
||||
)
|
||||
.mkString
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
|
||||
json.extract[Map[String, RelationVocabulary]]
|
||||
}
|
||||
|
||||
def extractRelationDate(relation: Relation): String = {
|
||||
|
||||
if (relation.getProperties == null || !relation.getProperties.isEmpty)
|
||||
null
|
||||
else {
|
||||
val date = relation.getProperties.asScala
|
||||
.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
|
||||
.map(p => p.getValue)
|
||||
if (date.isDefined)
|
||||
date.get
|
||||
else
|
||||
null
|
||||
}
|
||||
}
|
||||
|
||||
def extractRelationDate(summary: ScholixSummary): String = {
|
||||
|
||||
if (summary.getDate == null || summary.getDate.isEmpty)
|
||||
null
|
||||
else {
|
||||
summary.getDate.get(0)
|
||||
}
|
||||
}
|
||||
|
||||
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
|
||||
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
|
||||
|
||||
}
|
||||
|
||||
def generateScholixResourceFromResult(r: Result): ScholixResource = {
|
||||
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
|
||||
}
|
||||
|
||||
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
|
||||
new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
|
||||
override def zero: RelatedEntities = null
|
||||
|
||||
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
|
||||
val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
|
||||
val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
|
||||
|
||||
if (b == null)
|
||||
RelatedEntities(a._1, relatedDataset, relatedPublication)
|
||||
else
|
||||
RelatedEntities(
|
||||
a._1,
|
||||
b.relatedDataset + relatedDataset,
|
||||
b.relatedPublication + relatedPublication
|
||||
)
|
||||
}
|
||||
|
||||
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
|
||||
if (b1 != null && b2 != null)
|
||||
RelatedEntities(
|
||||
b1.id,
|
||||
b1.relatedDataset + b2.relatedDataset,
|
||||
b1.relatedPublication + b2.relatedPublication
|
||||
)
|
||||
else if (b1 != null)
|
||||
b1
|
||||
else
|
||||
b2
|
||||
}
|
||||
|
||||
override def finish(reduction: RelatedEntities): RelatedEntities = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
|
||||
|
||||
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
|
||||
}
|
||||
|
||||
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
|
||||
new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
|
||||
override def zero: Scholix = null
|
||||
|
||||
def scholix_complete(s: Scholix): Boolean = {
|
||||
if (s == null || s.getIdentifier == null) {
|
||||
false
|
||||
} else if (s.getSource == null || s.getTarget == null) {
|
||||
false
|
||||
} else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
|
||||
false
|
||||
else
|
||||
true
|
||||
}
|
||||
|
||||
override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
|
||||
if (scholix_complete(b)) b else a._2
|
||||
}
|
||||
|
||||
override def merge(b1: Scholix, b2: Scholix): Scholix = {
|
||||
if (scholix_complete(b1)) b1 else b2
|
||||
}
|
||||
|
||||
override def finish(reduction: Scholix): Scholix = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
|
||||
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
}
|
||||
|
||||
def createInverseScholixRelation(scholix: Scholix): Scholix = {
|
||||
val s = new Scholix
|
||||
s.setPublicationDate(scholix.getPublicationDate)
|
||||
s.setPublisher(scholix.getPublisher)
|
||||
s.setLinkprovider(scholix.getLinkprovider)
|
||||
s.setRelationship(inverseRelationShip(scholix.getRelationship))
|
||||
s.setSource(scholix.getTarget)
|
||||
s.setTarget(scholix.getSource)
|
||||
s.setIdentifier(
|
||||
DHPUtils.md5(
|
||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||
)
|
||||
)
|
||||
s
|
||||
|
||||
}
|
||||
|
||||
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
|
||||
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
|
||||
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
|
||||
new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
|
||||
}(collection.breakOut)
|
||||
l
|
||||
} else List()
|
||||
}
|
||||
|
||||
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
|
||||
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
|
||||
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
|
||||
new ScholixEntityId(
|
||||
d.getDatasourceName,
|
||||
List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
|
||||
)
|
||||
}(collection.breakOut)
|
||||
l
|
||||
} else List()
|
||||
}
|
||||
|
||||
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
|
||||
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
|
||||
|
||||
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c =>
|
||||
new ScholixEntityId(
|
||||
c.getValue,
|
||||
List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava
|
||||
)
|
||||
}.toList
|
||||
l
|
||||
} else List()
|
||||
}
|
||||
|
||||
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
|
||||
val s = new Scholix
|
||||
s.setPublicationDate(scholix.getPublicationDate)
|
||||
s.setPublisher(scholix.getPublisher)
|
||||
s.setLinkprovider(scholix.getLinkprovider)
|
||||
s.setRelationship(scholix.getRelationship)
|
||||
s.setSource(scholix.getSource)
|
||||
s.setTarget(generateScholixResourceFromSummary(target))
|
||||
s.setIdentifier(
|
||||
DHPUtils.md5(
|
||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||
)
|
||||
)
|
||||
s
|
||||
}
|
||||
|
||||
def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
|
||||
val s = new Scholix
|
||||
s.setPublicationDate(scholix.getPublicationDate)
|
||||
s.setPublisher(scholix.getPublisher)
|
||||
s.setLinkprovider(scholix.getLinkprovider)
|
||||
s.setRelationship(scholix.getRelationship)
|
||||
s.setSource(scholix.getSource)
|
||||
s.setTarget(target)
|
||||
s.setIdentifier(
|
||||
DHPUtils.md5(
|
||||
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
|
||||
)
|
||||
)
|
||||
s
|
||||
}
|
||||
|
||||
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
|
||||
val r = new ScholixResource
|
||||
r.setIdentifier(summaryObject.getLocalIdentifier)
|
||||
r.setDnetIdentifier(summaryObject.getId)
|
||||
|
||||
r.setObjectType(summaryObject.getTypology.toString)
|
||||
r.setObjectSubType(summaryObject.getSubType)
|
||||
|
||||
if (summaryObject.getTitle != null && !summaryObject.getTitle.isEmpty)
|
||||
r.setTitle(summaryObject.getTitle.get(0))
|
||||
|
||||
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
|
||||
val l: List[ScholixEntityId] =
|
||||
summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
|
||||
if (l.nonEmpty)
|
||||
r.setCreator(l.asJava)
|
||||
}
|
||||
|
||||
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
|
||||
r.setPublicationDate(summaryObject.getDate.get(0))
|
||||
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
|
||||
val plist: List[ScholixEntityId] =
|
||||
summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
|
||||
|
||||
if (plist.nonEmpty)
|
||||
r.setPublisher(plist.asJava)
|
||||
}
|
||||
|
||||
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
|
||||
|
||||
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
|
||||
.map(c =>
|
||||
new ScholixCollectedFrom(
|
||||
new ScholixEntityId(
|
||||
c.getDatasourceName,
|
||||
List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
|
||||
),
|
||||
"collected",
|
||||
"complete"
|
||||
)
|
||||
)
|
||||
.toList
|
||||
|
||||
if (l.nonEmpty)
|
||||
r.setCollectedFrom(l.asJava)
|
||||
|
||||
}
|
||||
r
|
||||
}
|
||||
|
||||
def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
|
||||
if (relation == null || source == null)
|
||||
return null
|
||||
val s = new Scholix
|
||||
var l: List[ScholixEntityId] = extractCollectedFrom(relation)
|
||||
if (l.isEmpty)
|
||||
l = extractCollectedFrom(source)
|
||||
if (l.isEmpty)
|
||||
return null
|
||||
s.setLinkprovider(l.asJava)
|
||||
var d = extractRelationDate(relation)
|
||||
if (d == null)
|
||||
d = source.getPublicationDate
|
||||
|
||||
s.setPublicationDate(d)
|
||||
|
||||
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
|
||||
s.setPublisher(source.getPublisher)
|
||||
}
|
||||
|
||||
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
||||
if (semanticRelation == null)
|
||||
return null
|
||||
s.setRelationship(
|
||||
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
||||
)
|
||||
s.setSource(source)
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
|
||||
|
||||
if (relation == null || source == null)
|
||||
return null
|
||||
|
||||
val s = new Scholix
|
||||
|
||||
var l: List[ScholixEntityId] = extractCollectedFrom(relation)
|
||||
if (l.isEmpty)
|
||||
l = extractCollectedFrom(source)
|
||||
if (l.isEmpty)
|
||||
return null
|
||||
|
||||
s.setLinkprovider(l.asJava)
|
||||
|
||||
var d = extractRelationDate(relation)
|
||||
if (d == null)
|
||||
d = extractRelationDate(source)
|
||||
|
||||
s.setPublicationDate(d)
|
||||
|
||||
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
|
||||
val l: List[ScholixEntityId] = source.getPublisher.asScala
|
||||
.map { p =>
|
||||
new ScholixEntityId(p, null)
|
||||
}(collection.breakOut)
|
||||
|
||||
if (l.nonEmpty)
|
||||
s.setPublisher(l.asJava)
|
||||
}
|
||||
|
||||
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
|
||||
if (semanticRelation == null)
|
||||
return null
|
||||
s.setRelationship(
|
||||
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
|
||||
)
|
||||
s.setSource(generateScholixResourceFromSummary(source))
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
def findURLForPID(
|
||||
pidValue: List[StructuredProperty],
|
||||
urls: List[String]
|
||||
): List[(StructuredProperty, String)] = {
|
||||
pidValue.map { p =>
|
||||
val pv = p.getValue
|
||||
|
||||
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
|
||||
(p, r.orNull)
|
||||
}
|
||||
}
|
||||
|
||||
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
|
||||
if (r.getInstance() == null || r.getInstance().isEmpty)
|
||||
return List()
|
||||
r.getInstance()
|
||||
.asScala
|
||||
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
|
||||
.filter(i => i.getPid != null && i.getUrl != null)
|
||||
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
|
||||
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
|
||||
.distinct
|
||||
.toList
|
||||
}
|
||||
|
||||
def resultToSummary(r: Result): ScholixSummary = {
|
||||
val s = new ScholixSummary
|
||||
s.setId(r.getId)
|
||||
if (r.getPid == null || r.getPid.isEmpty)
|
||||
return null
|
||||
|
||||
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r)
|
||||
if (persistentIdentifiers.isEmpty)
|
||||
return null
|
||||
s.setLocalIdentifier(persistentIdentifiers.asJava)
|
||||
if (r.isInstanceOf[Publication])
|
||||
s.setTypology(Typology.publication)
|
||||
else
|
||||
s.setTypology(Typology.dataset)
|
||||
|
||||
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
|
||||
|
||||
if (r.getTitle != null && r.getTitle.asScala.nonEmpty) {
|
||||
val titles: List[String] = r.getTitle.asScala.map(t => t.getValue).toList
|
||||
if (titles.nonEmpty)
|
||||
s.setTitle(titles.asJava)
|
||||
else
|
||||
return null
|
||||
}
|
||||
|
||||
if (r.getAuthor != null && !r.getAuthor.isEmpty) {
|
||||
val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname).toList
|
||||
if (authors.nonEmpty)
|
||||
s.setAuthor(authors.asJava)
|
||||
}
|
||||
if (r.getInstance() != null) {
|
||||
val dt: List[String] = r
|
||||
.getInstance()
|
||||
.asScala
|
||||
.filter(i => i.getDateofacceptance != null)
|
||||
.map(i => i.getDateofacceptance.getValue)
|
||||
.toList
|
||||
if (dt.nonEmpty)
|
||||
s.setDate(dt.distinct.asJava)
|
||||
}
|
||||
if (r.getDescription != null && !r.getDescription.isEmpty) {
|
||||
val d = r.getDescription.asScala.find(f => f != null && f.getValue != null)
|
||||
if (d.isDefined)
|
||||
s.setDescription(d.get.getValue)
|
||||
}
|
||||
|
||||
if (r.getSubject != null && !r.getSubject.isEmpty) {
|
||||
val subjects: List[SchemeValue] = r.getSubject.asScala
|
||||
.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
|
||||
.toList
|
||||
if (subjects.nonEmpty)
|
||||
s.setSubject(subjects.asJava)
|
||||
}
|
||||
|
||||
if (r.getPublisher != null)
|
||||
s.setPublisher(List(r.getPublisher.getValue).asJava)
|
||||
|
||||
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
|
||||
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
|
||||
.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
|
||||
.toList
|
||||
if (cf.nonEmpty)
|
||||
s.setDatasources(cf.distinct.asJava)
|
||||
}
|
||||
|
||||
s.setRelatedDatasets(0)
|
||||
s.setRelatedPublications(0)
|
||||
s.setRelatedUnknown(0)
|
||||
|
||||
s
|
||||
}
|
||||
|
||||
}
|
|
@ -107,7 +107,7 @@ class OafMapperUtilsTest {
|
|||
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
|
||||
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
|
||||
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
|
||||
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
|
||||
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
|
||||
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
|
||||
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
|
||||
|
@ -185,6 +185,22 @@ class OafMapperUtilsTest {
|
|||
.getClassid());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testDelegatedAuthority() throws IOException {
|
||||
Dataset d1 = read("dataset_2.json", Dataset.class);
|
||||
Dataset d2 = read("dataset_delegated.json", Dataset.class);
|
||||
|
||||
assertEquals(1, d2.getCollectedfrom().size());
|
||||
assertTrue(cfId(d2.getCollectedfrom()).contains(ModelConstants.ZENODO_OD_ID));
|
||||
|
||||
Result res = OafMapperUtils.mergeResults(d1, d2);
|
||||
|
||||
assertEquals(d2, res);
|
||||
|
||||
System.out.println(OBJECT_MAPPER.writeValueAsString(res));
|
||||
|
||||
}
|
||||
|
||||
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
|
||||
return collectedfrom.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
|
|
@ -1 +1,140 @@
|
|||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}
|
||||
{
|
||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
|
||||
"resuttype": {"classid": "dataset"},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "doi"},
|
||||
"value": "10.1016/j.cmet.2011.03.013"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "urn"},
|
||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "scp-number"},
|
||||
"value": "79953761260"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "pmc"},
|
||||
"value": "21459329"
|
||||
}
|
||||
],
|
||||
"collectedfrom": [
|
||||
{
|
||||
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
|
||||
"value": "Repository B"
|
||||
}
|
||||
],
|
||||
"instance": [
|
||||
{
|
||||
"refereed": {
|
||||
"classid": "0000",
|
||||
"classname": "UNKNOWN",
|
||||
"schemeid": "dnet:review_levels",
|
||||
"schemename": "dnet:review_levels"
|
||||
},
|
||||
"hostedby": {
|
||||
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
|
||||
"value": "Zenodo"
|
||||
},
|
||||
"accessright": {
|
||||
"classid": "OPEN",
|
||||
"classname": "Open Access",
|
||||
"schemeid": "dnet:access_modes",
|
||||
"schemename": "dnet:access_modes"
|
||||
},
|
||||
"processingchargecurrency": {
|
||||
"dataInfo": {
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"inferenceprovenance": "",
|
||||
"invisible": true,
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "EUR"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"inferenceprovenance": "",
|
||||
"invisible": true,
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "doi",
|
||||
"classname": "Digital Object Identifier",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "10.1371/journal.pone.0085605"
|
||||
}
|
||||
],
|
||||
"distributionlocation": "",
|
||||
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
|
||||
"alternateIdentifier": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"inferenceprovenance": "",
|
||||
"invisible": true,
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "pmid",
|
||||
"classname": "PubMed ID",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "24454899.0"
|
||||
}
|
||||
],
|
||||
"collectedfrom": {
|
||||
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
|
||||
"value": "Repository B"
|
||||
},
|
||||
"processingchargeamount": {
|
||||
"dataInfo": {
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"inferenceprovenance": "",
|
||||
"invisible": true,
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "1022.02"
|
||||
},
|
||||
"instancetype": {
|
||||
"classid": "0004",
|
||||
"classname": "Conference object",
|
||||
"schemeid": "dnet:publication_resource",
|
||||
"schemename": "dnet:publication_resource"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,140 @@
|
|||
{
|
||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
|
||||
"resuttype": {"classid": "dataset"},
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {"classid": "doi"},
|
||||
"value": "10.1016/j.cmet.2011.03.013"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "urn"},
|
||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "scp-number"},
|
||||
"value": "79953761260"
|
||||
},
|
||||
{
|
||||
"qualifier": {"classid": "pmc"},
|
||||
"value": "21459329"
|
||||
}
|
||||
],
|
||||
"collectedfrom": [
|
||||
{
|
||||
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
|
||||
"value": "Zenodo"
|
||||
}
|
||||
],
|
||||
"instance": [
|
||||
{
|
||||
"refereed": {
|
||||
"classid": "0000",
|
||||
"classname": "UNKNOWN",
|
||||
"schemeid": "dnet:review_levels",
|
||||
"schemename": "dnet:review_levels"
|
||||
},
|
||||
"hostedby": {
|
||||
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
|
||||
"value": "Zenodo"
|
||||
},
|
||||
"accessright": {
|
||||
"classid": "OPEN",
|
||||
"classname": "Open Access",
|
||||
"schemeid": "dnet:access_modes",
|
||||
"schemename": "dnet:access_modes"
|
||||
},
|
||||
"processingchargecurrency": {
|
||||
"dataInfo": {
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"inferenceprovenance": "",
|
||||
"invisible": true,
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "EUR"
|
||||
},
|
||||
"pid": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"inferenceprovenance": "",
|
||||
"invisible": true,
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "doi",
|
||||
"classname": "Digital Object Identifier",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "10.1371/journal.pone.0085605"
|
||||
}
|
||||
],
|
||||
"distributionlocation": "",
|
||||
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
|
||||
"alternateIdentifier": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"inferenceprovenance": "",
|
||||
"invisible": true,
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "pmid",
|
||||
"classname": "PubMed ID",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "24454899.0"
|
||||
}
|
||||
],
|
||||
"collectedfrom": {
|
||||
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
|
||||
"value": "Zenodo"
|
||||
},
|
||||
"processingchargeamount": {
|
||||
"dataInfo": {
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"inferenceprovenance": "",
|
||||
"invisible": true,
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "1022.02"
|
||||
},
|
||||
"instancetype": {
|
||||
"classid": "0004",
|
||||
"classname": "Conference object",
|
||||
"schemeid": "dnet:publication_resource",
|
||||
"schemename": "dnet:publication_resource"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
package eu.dnetlib.dhp.actionmanager;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
|
@ -11,19 +11,29 @@ import org.apache.spark.sql.SparkSession;
|
|||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
public class Constants {
|
||||
|
||||
public static final String DOI = "doi";
|
||||
public static final String DOI_CLASSNAME = "Digital Object Identifier";
|
||||
|
||||
public static final String DEFAULT_DELIMITER = ",";
|
||||
|
||||
public static final String UPDATE_DATA_INFO_TYPE = "update";
|
||||
public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
|
||||
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
|
||||
public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
|
||||
public static final String UPDATE_SUBJECT_SDG_CLASS_ID = "subject:sdg";
|
||||
|
||||
public static final String FOS_CLASS_ID = "FOS";
|
||||
public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
|
||||
|
||||
public static final String SDG_CLASS_ID = "SDG";
|
||||
public static final String SDG_CLASS_NAME = "Sustainable Development Goals";
|
||||
|
||||
public static final String NULL = "NULL";
|
||||
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
@ -46,4 +56,37 @@ public class Constants {
|
|||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
public static StructuredProperty getSubject(String sbj, String classid, String classname,
|
||||
String diqualifierclassid) {
|
||||
if (sbj.equals(NULL))
|
||||
return null;
|
||||
StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(sbj);
|
||||
sp
|
||||
.setQualifier(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
classid,
|
||||
classname,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
|
||||
sp
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
UPDATE_DATA_INFO_TYPE,
|
||||
true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
diqualifierclassid,
|
||||
UPDATE_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
""));
|
||||
|
||||
return sp;
|
||||
|
||||
}
|
||||
}
|
|
@ -1,86 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
/**
|
||||
* Just collects all the atomic actions produced for the different results and saves them in
|
||||
* outputpath for the ActionSet
|
||||
*/
|
||||
public class CollectAndSave implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
CollectAndSave.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json")));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath {}: ", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
collectAndSave(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
sc
|
||||
.sequenceFile(inputPath + "/publication", Text.class, Text.class)
|
||||
.union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class))
|
||||
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
|
||||
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
}
|
|
@ -1,28 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Subset of the information of the generic results that are needed to create the atomic action
|
||||
*/
|
||||
public class PreparedResult implements Serializable {
|
||||
private String id; // openaire id
|
||||
private String value; // doi
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
}
|
|
@ -1,6 +1,7 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -15,7 +16,6 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
@ -24,11 +24,15 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
|
||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
|
@ -46,7 +50,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
.toString(
|
||||
SparkAtomicActionScoreJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json"));
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
|
@ -65,14 +69,6 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
final String bipScorePath = parser.get("bipScorePath");
|
||||
log.info("bipScorePath: {}", bipScorePath);
|
||||
|
||||
final String resultClassName = parser.get("resultTableName");
|
||||
log.info("resultTableName: {}", resultClassName);
|
||||
|
||||
Class<I> inputClazz = (Class<I>) Class.forName(resultClassName);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
|
@ -80,12 +76,11 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
prepareResults(spark, inputPath, outputPath, bipScorePath, inputClazz);
|
||||
prepareResults(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath,
|
||||
String bipScorePath, Class<I> inputClazz) {
|
||||
private static <I extends Result> void prepareResults(SparkSession spark, String bipScorePath, String outputPath) {
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
|
@ -101,41 +96,19 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
return bs;
|
||||
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
|
||||
|
||||
Dataset<I> results = readPath(spark, inputPath, inputClazz);
|
||||
|
||||
results.createOrReplaceTempView("result");
|
||||
|
||||
Dataset<PreparedResult> preparedResult = spark
|
||||
.sql(
|
||||
"select pIde.value value, id " +
|
||||
"from result " +
|
||||
"lateral view explode (pid) p as pIde " +
|
||||
"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
|
||||
.as(Encoders.bean(PreparedResult.class));
|
||||
|
||||
bipScores
|
||||
.joinWith(
|
||||
preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
|
||||
"inner")
|
||||
.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
|
||||
BipScore ret = value._1();
|
||||
ret.setId(value._2().getId());
|
||||
return ret;
|
||||
}, Encoders.bean(BipScore.class))
|
||||
.groupByKey((MapFunction<BipScore, String>) BipScore::getId, Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
|
||||
Result ret = new Result();
|
||||
ret.setDataInfo(getDataInfo());
|
||||
BipScore first = it.next();
|
||||
ret.setId(first.getId());
|
||||
|
||||
ret.setMeasures(getMeasure(first));
|
||||
it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
|
||||
.map((MapFunction<BipScore, Result>) bs -> {
|
||||
Result ret = new Result();
|
||||
|
||||
ret.setId(bs.getId());
|
||||
|
||||
ret.setMeasures(getMeasure(bs));
|
||||
|
||||
return ret;
|
||||
}, Encoders.bean(Result.class))
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(inputClazz, p))
|
||||
.map(p -> new AtomicAction(Result.class, p))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
|
@ -159,7 +132,21 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
KeyValue kv = new KeyValue();
|
||||
kv.setValue(unit.getValue());
|
||||
kv.setKey(unit.getKey());
|
||||
kv.setDataInfo(getDataInfo());
|
||||
kv
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
UPDATE_DATA_INFO_TYPE,
|
||||
true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
UPDATE_MEASURE_BIP_CLASS_ID,
|
||||
UPDATE_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
""));
|
||||
return kv;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
|
@ -168,21 +155,6 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static DataInfo getDataInfo() {
|
||||
DataInfo di = new DataInfo();
|
||||
di.setInferred(false);
|
||||
di.setInvisible(false);
|
||||
di.setDeletedbyinference(false);
|
||||
di.setTrust("");
|
||||
Qualifier qualifier = new Qualifier();
|
||||
qualifier.setClassid("sysimport:actionset");
|
||||
qualifier.setClassname("Harvested");
|
||||
qualifier.setSchemename("dnet:provenanceActions");
|
||||
qualifier.setSchemeid("dnet:provenanceActions");
|
||||
di.setProvenanceaction(qualifier);
|
||||
return di;
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
package eu.dnetlib.dhp.actionmanager.bipmodel;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
package eu.dnetlib.dhp.actionmanager.bipmodel;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
package eu.dnetlib.dhp.actionmanager.bipmodel;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
package eu.dnetlib.dhp.actionmanager.bipmodel;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
|
@ -1,77 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.GetCSV;
|
||||
|
||||
public class GetFOSData implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GetFOSData.class);
|
||||
|
||||
public static final char DEFAULT_DELIMITER = '\t';
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
GetFOSData.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json"))));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
// the path where the original fos csv file is stored
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
log.info("sourcePath {}", sourcePath);
|
||||
|
||||
// the path where to put the file as json
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}", outputPath);
|
||||
|
||||
final String hdfsNameNode = parser.get("hdfsNameNode");
|
||||
log.info("hdfsNameNode {}", hdfsNameNode);
|
||||
|
||||
final String classForName = parser.get("classForName");
|
||||
log.info("classForName {}", classForName);
|
||||
|
||||
final char delimiter = Optional
|
||||
.ofNullable(parser.get("delimiter"))
|
||||
.map(s -> s.charAt(0))
|
||||
.orElse(DEFAULT_DELIMITER);
|
||||
log.info("delimiter {}", delimiter);
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
|
||||
new GetFOSData().doRewrite(sourcePath, outputPath, classForName, delimiter, fileSystem);
|
||||
|
||||
}
|
||||
|
||||
public void doRewrite(String inputPath, String outputFile, String classForName, char delimiter, FileSystem fs)
|
||||
throws IOException, ClassNotFoundException {
|
||||
|
||||
// reads the csv and writes it as its json equivalent
|
||||
try (InputStreamReader reader = new InputStreamReader(fs.open(new Path(inputPath)))) {
|
||||
GetCSV.getCsv(fs, reader, outputFile, classForName, delimiter);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class GetFOSSparkJob implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GetFOSSparkJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
GetFOSSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
// the path where the original fos csv file is stored
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
log.info("sourcePath {}", sourcePath);
|
||||
|
||||
// the path where to put the file as json
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}", outputPath);
|
||||
|
||||
final String delimiter = Optional
|
||||
.ofNullable(parser.get("delimiter"))
|
||||
.orElse(DEFAULT_DELIMITER);
|
||||
|
||||
SparkConf sconf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
sconf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
getFOS(
|
||||
spark,
|
||||
sourcePath,
|
||||
outputPath,
|
||||
delimiter);
|
||||
});
|
||||
}
|
||||
|
||||
private static void getFOS(SparkSession spark, String sourcePath, String outputPath, String delimiter) {
|
||||
Dataset<Row> fosData = spark
|
||||
.read()
|
||||
.format("csv")
|
||||
.option("sep", delimiter)
|
||||
.option("inferSchema", "true")
|
||||
.option("header", "true")
|
||||
.option("quotes", "\"")
|
||||
.load(sourcePath);
|
||||
|
||||
fosData.map((MapFunction<Row, FOSDataModel>) r -> {
|
||||
FOSDataModel fosDataModel = new FOSDataModel();
|
||||
fosDataModel.setDoi(r.getString(0).toLowerCase());
|
||||
fosDataModel.setLevel1(r.getString(1));
|
||||
fosDataModel.setLevel2(r.getString(2));
|
||||
fosDataModel.setLevel3(r.getString(3));
|
||||
return fosDataModel;
|
||||
}, Encoders.bean(FOSDataModel.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class GetSDGSparkJob implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GetSDGSparkJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
GetSDGSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
// the path where the original fos csv file is stored
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
log.info("sourcePath {}", sourcePath);
|
||||
|
||||
// the path where to put the file as json
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}", outputPath);
|
||||
|
||||
final String delimiter = Optional
|
||||
.ofNullable(parser.get("delimiter"))
|
||||
.orElse(DEFAULT_DELIMITER);
|
||||
|
||||
SparkConf sconf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
sconf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
getSDG(
|
||||
spark,
|
||||
sourcePath,
|
||||
outputPath,
|
||||
delimiter);
|
||||
});
|
||||
}
|
||||
|
||||
private static void getSDG(SparkSession spark, String sourcePath, String outputPath, String delimiter) {
|
||||
Dataset<Row> sdgData = spark
|
||||
.read()
|
||||
.format("csv")
|
||||
.option("sep", delimiter)
|
||||
.option("inferSchema", "true")
|
||||
.option("header", "true")
|
||||
.option("quotes", "\"")
|
||||
.load(sourcePath);
|
||||
|
||||
sdgData.map((MapFunction<Row, SDGDataModel>) r -> {
|
||||
SDGDataModel sdgDataModel = new SDGDataModel();
|
||||
sdgDataModel.setDoi(r.getString(0).toLowerCase());
|
||||
sdgDataModel.setSbj(r.getString(1));
|
||||
|
||||
return sdgDataModel;
|
||||
}, Encoders.bean(SDGDataModel.class))
|
||||
.filter((FilterFunction<SDGDataModel>) sdg -> sdg.getSbj() != null)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,17 +1,17 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
|
||||
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.UPDATE_CLASS_NAME;
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.UPDATE_CLASS_NAME;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.hdfs.client.HdfsUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
@ -24,14 +24,16 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipDeserialize;
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipScore;
|
||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
|
||||
import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Measure;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
|
@ -40,7 +42,7 @@ public class PrepareBipFinder implements Serializable {
|
|||
private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class);
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static <I extends Result> void main(String[] args) throws Exception {
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
|
@ -76,7 +78,7 @@ public class PrepareBipFinder implements Serializable {
|
|||
});
|
||||
}
|
||||
|
||||
private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath) {
|
||||
private static void prepareResults(SparkSession spark, String inputPath, String outputPath) {
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
|
@ -89,13 +91,44 @@ public class PrepareBipFinder implements Serializable {
|
|||
BipScore bs = new BipScore();
|
||||
bs.setId(key);
|
||||
bs.setScoreList(entry.get(key));
|
||||
|
||||
return bs;
|
||||
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class))
|
||||
.map((MapFunction<BipScore, Result>) v -> {
|
||||
Result r = new Result();
|
||||
final String cleanedPid = CleaningFunctions.normalizePidValue(DOI, v.getId());
|
||||
|
||||
r.setId(DHPUtils.generateUnresolvedIdentifier(v.getId(), DOI));
|
||||
r.setMeasures(getMeasure(v));
|
||||
Instance inst = new Instance();
|
||||
inst.setMeasures(getMeasure(v));
|
||||
|
||||
inst
|
||||
.setPid(
|
||||
Arrays
|
||||
.asList(
|
||||
OafMapperUtils
|
||||
.structuredProperty(
|
||||
cleanedPid,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
DOI, DOI_CLASSNAME,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES),
|
||||
null)));
|
||||
r.setInstance(Arrays.asList(inst));
|
||||
r
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ModelConstants.PROVENANCE_ENRICH,
|
||||
null,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
null));
|
||||
return r;
|
||||
}, Encoders.bean(Result.class))
|
||||
.write()
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -10,8 +10,8 @@ import java.util.stream.Collectors;
|
|||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
|
@ -67,20 +67,36 @@ public class PrepareFOSSparkJob implements Serializable {
|
|||
private static void distributeFOSdois(SparkSession spark, String sourcePath, String outputPath) {
|
||||
Dataset<FOSDataModel> fosDataset = readPath(spark, sourcePath, FOSDataModel.class);
|
||||
|
||||
fosDataset.flatMap((FlatMapFunction<FOSDataModel, FOSDataModel>) v -> {
|
||||
List<FOSDataModel> fosList = new ArrayList<>();
|
||||
final String level1 = v.getLevel1();
|
||||
final String level2 = v.getLevel2();
|
||||
final String level3 = v.getLevel3();
|
||||
Arrays
|
||||
.stream(v.getDoi().split("\u0002"))
|
||||
.forEach(d -> fosList.add(FOSDataModel.newInstance(d, level1, level2, level3)));
|
||||
return fosList.iterator();
|
||||
}, Encoders.bean(FOSDataModel.class))
|
||||
.map((MapFunction<FOSDataModel, Result>) value -> {
|
||||
fosDataset
|
||||
.groupByKey((MapFunction<FOSDataModel, String>) v -> v.getDoi().toLowerCase(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, FOSDataModel, Result>) (k, it) -> {
|
||||
Result r = new Result();
|
||||
r.setId(DHPUtils.generateUnresolvedIdentifier(value.getDoi(), DOI));
|
||||
r.setSubject(getSubjects(value));
|
||||
FOSDataModel first = it.next();
|
||||
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
|
||||
|
||||
HashSet<String> level1 = new HashSet<>();
|
||||
HashSet<String> level2 = new HashSet<>();
|
||||
HashSet<String> level3 = new HashSet<>();
|
||||
addLevels(level1, level2, level3, first);
|
||||
it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
|
||||
List<StructuredProperty> sbjs = new ArrayList<>();
|
||||
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||
r.setSubject(sbjs);
|
||||
r
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ModelConstants.PROVENANCE_ENRICH,
|
||||
null,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
null));
|
||||
return r;
|
||||
}, Encoders.bean(Result.class))
|
||||
.write()
|
||||
|
@ -89,45 +105,11 @@ public class PrepareFOSSparkJob implements Serializable {
|
|||
.json(outputPath + "/fos");
|
||||
}
|
||||
|
||||
private static List<StructuredProperty> getSubjects(FOSDataModel fos) {
|
||||
return Arrays
|
||||
.asList(getSubject(fos.getLevel1()), getSubject(fos.getLevel2()), getSubject(fos.getLevel3()))
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static StructuredProperty getSubject(String sbj) {
|
||||
if (sbj.equals(NULL))
|
||||
return null;
|
||||
StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(sbj);
|
||||
sp
|
||||
.setQualifier(
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
FOS_CLASS_ID,
|
||||
FOS_CLASS_NAME,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
|
||||
sp
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
UPDATE_DATA_INFO_TYPE,
|
||||
true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
UPDATE_SUBJECT_FOS_CLASS_ID,
|
||||
UPDATE_CLASS_NAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
""));
|
||||
|
||||
return sp;
|
||||
|
||||
private static void addLevels(HashSet<String> level1, HashSet<String> level2, HashSet<String> level3,
|
||||
FOSDataModel first) {
|
||||
level1.add(first.getLevel1());
|
||||
level2.add(first.getLevel2());
|
||||
level3.add(first.getLevel3());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class PrepareSDGSparkJob implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareSDGSparkJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
PrepareSDGSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
String sourcePath = parser.get("sourcePath");
|
||||
log.info("sourcePath: {}", sourcePath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
doPrepare(
|
||||
spark,
|
||||
sourcePath,
|
||||
|
||||
outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void doPrepare(SparkSession spark, String sourcePath, String outputPath) {
|
||||
Dataset<SDGDataModel> sdgDataset = readPath(spark, sourcePath, SDGDataModel.class);
|
||||
|
||||
sdgDataset
|
||||
.groupByKey((MapFunction<SDGDataModel, String>) r -> r.getDoi().toLowerCase(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, SDGDataModel, Result>) (k, it) -> {
|
||||
Result r = new Result();
|
||||
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
|
||||
SDGDataModel first = it.next();
|
||||
List<StructuredProperty> sbjs = new ArrayList<>();
|
||||
sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
|
||||
it
|
||||
.forEachRemaining(
|
||||
s -> sbjs
|
||||
.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
|
||||
r.setSubject(sbjs);
|
||||
r
|
||||
.setDataInfo(
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, true,
|
||||
false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
ModelConstants.PROVENANCE_ENRICH,
|
||||
null,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
null));
|
||||
return r;
|
||||
}, Encoders.bean(Result.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "/sdg");
|
||||
}
|
||||
|
||||
}
|
|
@ -1,10 +1,11 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -20,7 +21,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class SparkSaveUnresolved implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class);
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkSaveUnresolved.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
|
@ -64,10 +65,22 @@ public class SparkSaveUnresolved implements Serializable {
|
|||
.map(
|
||||
(MapFunction<String, Result>) l -> OBJECT_MAPPER.readValue(l, Result.class),
|
||||
Encoders.bean(Result.class))
|
||||
.groupByKey((MapFunction<Result, String>) r -> r.getId(), Encoders.STRING())
|
||||
.groupByKey((MapFunction<Result, String>) Result::getId, Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
|
||||
Result ret = it.next();
|
||||
it.forEachRemaining(r -> ret.mergeFrom(r));
|
||||
it.forEachRemaining(r -> {
|
||||
if (r.getInstance() != null) {
|
||||
ret.setInstance(r.getInstance());
|
||||
}
|
||||
if (r.getSubject() != null) {
|
||||
if (ret.getSubject() != null)
|
||||
ret.getSubject().addAll(r.getSubject());
|
||||
else
|
||||
ret.setSubject(r.getSubject());
|
||||
}
|
||||
|
||||
// ret.mergeFrom(r)
|
||||
});
|
||||
return ret;
|
||||
}, Encoders.bean(Result.class))
|
||||
.write()
|
||||
|
|
|
@ -1,28 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Class that maps the model of the bipFinder! input data.
|
||||
* Only needed for deserialization purposes
|
||||
*/
|
||||
|
||||
public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
|
||||
|
||||
public BipDeserialize() {
|
||||
super();
|
||||
}
|
||||
|
||||
public List<Score> get(String key) {
|
||||
|
||||
if (super.get(key) == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
return super.get(key);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,30 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Rewriting of the bipFinder input data by extracting the identifier of the result (doi)
|
||||
*/
|
||||
|
||||
public class BipScore implements Serializable {
|
||||
private String id; // doi
|
||||
private List<Score> scoreList; // unit as given in the inputfile
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public List<Score> getScoreList() {
|
||||
return scoreList;
|
||||
}
|
||||
|
||||
public void setScoreList(List<Score> scoreList) {
|
||||
this.scoreList = scoreList;
|
||||
}
|
||||
}
|
|
@ -6,19 +6,19 @@ import java.io.Serializable;
|
|||
import com.opencsv.bean.CsvBindByPosition;
|
||||
|
||||
public class FOSDataModel implements Serializable {
|
||||
@CsvBindByPosition(position = 1)
|
||||
@CsvBindByPosition(position = 0)
|
||||
// @CsvBindByName(column = "doi")
|
||||
private String doi;
|
||||
|
||||
@CsvBindByPosition(position = 2)
|
||||
@CsvBindByPosition(position = 1)
|
||||
// @CsvBindByName(column = "level1")
|
||||
private String level1;
|
||||
|
||||
@CsvBindByPosition(position = 3)
|
||||
@CsvBindByPosition(position = 2)
|
||||
// @CsvBindByName(column = "level2")
|
||||
private String level2;
|
||||
|
||||
@CsvBindByPosition(position = 4)
|
||||
@CsvBindByPosition(position = 3)
|
||||
// @CsvBindByName(column = "level3")
|
||||
private String level3;
|
||||
|
||||
|
|
|
@ -1,26 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class KeyValue implements Serializable {
|
||||
|
||||
private String key;
|
||||
private String value;
|
||||
|
||||
public String getKey() {
|
||||
return key;
|
||||
}
|
||||
|
||||
public void setKey(String key) {
|
||||
this.key = key;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.opencsv.bean.CsvBindByPosition;
|
||||
|
||||
public class SDGDataModel implements Serializable {
|
||||
|
||||
@CsvBindByPosition(position = 0)
|
||||
// @CsvBindByName(column = "doi")
|
||||
private String doi;
|
||||
|
||||
@CsvBindByPosition(position = 1)
|
||||
// @CsvBindByName(column = "sdg")
|
||||
private String sbj;
|
||||
|
||||
public SDGDataModel() {
|
||||
|
||||
}
|
||||
|
||||
public SDGDataModel(String doi, String sbj) {
|
||||
this.doi = doi;
|
||||
this.sbj = sbj;
|
||||
|
||||
}
|
||||
|
||||
public static SDGDataModel newInstance(String d, String sbj) {
|
||||
return new SDGDataModel(d, sbj);
|
||||
}
|
||||
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
public String getSbj() {
|
||||
return sbj;
|
||||
}
|
||||
|
||||
public void setSbj(String sbj) {
|
||||
this.sbj = sbj;
|
||||
}
|
||||
}
|
|
@ -1,30 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* represents the score in the input file
|
||||
*/
|
||||
public class Score implements Serializable {
|
||||
|
||||
private String id;
|
||||
private List<KeyValue> unit;
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public List<KeyValue> getUnit() {
|
||||
return unit;
|
||||
}
|
||||
|
||||
public void setUnit(List<KeyValue> unit) {
|
||||
this.unit = unit;
|
||||
}
|
||||
}
|
|
@ -14,6 +14,7 @@ import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
|||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -21,6 +22,7 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
|
@ -83,10 +85,13 @@ public class CreateActionSetSparkJob implements Serializable {
|
|||
private static void extractContent(SparkSession spark, String inputPath, String outputPath,
|
||||
boolean shouldDuplicateRels) {
|
||||
spark
|
||||
.sqlContext()
|
||||
.createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING())
|
||||
.read()
|
||||
.textFile(inputPath + "/*")
|
||||
.map(
|
||||
(MapFunction<String, COCI>) value -> OBJECT_MAPPER.readValue(value, COCI.class),
|
||||
Encoders.bean(COCI.class))
|
||||
.flatMap(
|
||||
(FlatMapFunction<String, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
|
||||
(FlatMapFunction<COCI, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
|
||||
Encoders.bean(Relation.class))
|
||||
.filter((FilterFunction<Relation>) value -> value != null)
|
||||
.toJavaRDD()
|
||||
|
@ -98,26 +103,29 @@ public class CreateActionSetSparkJob implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
private static List<Relation> createRelation(String value, boolean duplicate) {
|
||||
String[] line = value.split(",");
|
||||
if (!line[1].startsWith("10.")) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
private static List<Relation> createRelation(COCI value, boolean duplicate) {
|
||||
|
||||
List<Relation> relationList = new ArrayList<>();
|
||||
|
||||
String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[1]));
|
||||
final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[2]));
|
||||
String citing = ID_PREFIX
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting()));
|
||||
final String cited = ID_PREFIX
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited()));
|
||||
|
||||
relationList
|
||||
.addAll(
|
||||
getRelations(
|
||||
citing,
|
||||
cited));
|
||||
if(!citing.equals(cited)){
|
||||
relationList
|
||||
.addAll(
|
||||
getRelations(
|
||||
citing,
|
||||
cited));
|
||||
|
||||
if (duplicate && line[1].endsWith(".refs")) {
|
||||
citing = ID_PREFIX + IdentifierFactory
|
||||
.md5(CleaningFunctions.normalizePidValue("doi", line[1].substring(0, line[1].indexOf(".refs"))));
|
||||
relationList.addAll(getRelations(citing, cited));
|
||||
if (duplicate && value.getCiting().endsWith(".refs")) {
|
||||
citing = ID_PREFIX + IdentifierFactory
|
||||
.md5(
|
||||
CleaningFunctions
|
||||
.normalizePidValue("doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs"))));
|
||||
relationList.addAll(getRelations(citing, cited));
|
||||
}
|
||||
}
|
||||
|
||||
return relationList;
|
||||
|
|
|
@ -0,0 +1,103 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.opencitations;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.RemoteIterator;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class ReadCOCI implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ReadCOCI.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
ReadCOCI.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String[] inputFile = parser.get("inputFile").split(";");
|
||||
log.info("inputFile {}", inputFile.toString());
|
||||
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath {}", workingPath);
|
||||
|
||||
SparkConf sconf = new SparkConf();
|
||||
|
||||
final String delimiter = Optional
|
||||
.ofNullable(parser.get("delimiter"))
|
||||
.orElse(DEFAULT_DELIMITER);
|
||||
|
||||
runWithSparkSession(
|
||||
sconf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
doRead(
|
||||
spark,
|
||||
workingPath,
|
||||
inputFile,
|
||||
outputPath,
|
||||
delimiter);
|
||||
});
|
||||
}
|
||||
|
||||
private static void doRead(SparkSession spark, String workingPath, String[] inputFiles,
|
||||
String outputPath,
|
||||
String delimiter) throws IOException {
|
||||
|
||||
for(String inputFile : inputFiles){
|
||||
String p_string = workingPath + "/" + inputFile + ".gz";
|
||||
|
||||
Dataset<Row> cociData = spark
|
||||
.read()
|
||||
.format("csv")
|
||||
.option("sep", delimiter)
|
||||
.option("inferSchema", "true")
|
||||
.option("header", "true")
|
||||
.option("quotes", "\"")
|
||||
.load(p_string)
|
||||
.repartition(100);
|
||||
|
||||
cociData.map((MapFunction<Row, COCI>) row -> {
|
||||
COCI coci = new COCI();
|
||||
coci.setOci(row.getString(0));
|
||||
coci.setCiting(row.getString(1));
|
||||
coci.setCited(row.getString(2));
|
||||
return coci;
|
||||
}, Encoders.bean(COCI.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + inputFile);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.opencitations.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.opencsv.bean.CsvBindByPosition;
|
||||
|
||||
public class COCI implements Serializable {
|
||||
private String oci;
|
||||
|
||||
private String citing;
|
||||
|
||||
private String cited;
|
||||
|
||||
|
||||
public String getOci() {
|
||||
return oci;
|
||||
}
|
||||
|
||||
public void setOci(String oci) {
|
||||
this.oci = oci;
|
||||
}
|
||||
|
||||
public String getCiting() {
|
||||
return citing;
|
||||
}
|
||||
|
||||
public void setCiting(String citing) {
|
||||
this.citing = citing;
|
||||
}
|
||||
|
||||
public String getCited() {
|
||||
return cited;
|
||||
}
|
||||
|
||||
public void setCited(String cited) {
|
||||
this.cited = cited;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -1,69 +0,0 @@
|
|||
package eu.dnetlib.dhp.actionmanager.scholix
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object SparkCreateActionset {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
|
||||
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $targetPath")
|
||||
|
||||
val workingDirFolder = parser.get("workingDirFolder")
|
||||
log.info(s"workingDirFolder -> $workingDirFolder")
|
||||
|
||||
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result]
|
||||
implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
val relation = spark.read.load(s"$sourcePath/relation").as[Relation]
|
||||
|
||||
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
||||
.flatMap(r => List(r.getSource, r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation")
|
||||
|
||||
|
||||
val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]
|
||||
|
||||
log.info("extract source and target Identifier involved in relations")
|
||||
|
||||
|
||||
log.info("save relation filtered")
|
||||
|
||||
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf")
|
||||
|
||||
log.info("saving entities")
|
||||
|
||||
val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))
|
||||
|
||||
entities
|
||||
.joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
|
||||
.map(p => p._1._2)
|
||||
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
|
||||
}
|
||||
|
||||
}
|
|
@ -1,86 +0,0 @@
|
|||
package eu.dnetlib.dhp.actionmanager.scholix
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation}
|
||||
import org.apache.hadoop.io.Text
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object SparkSaveActionSet {
|
||||
|
||||
|
||||
def toActionSet(item: Oaf): (String, String) = {
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
item match {
|
||||
case dataset: OafDataset =>
|
||||
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
|
||||
a.setClazz(classOf[OafDataset])
|
||||
a.setPayload(dataset)
|
||||
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case publication: Publication =>
|
||||
val a: AtomicAction[Publication] = new AtomicAction[Publication]
|
||||
a.setClazz(classOf[Publication])
|
||||
a.setPayload(publication)
|
||||
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case software: Software =>
|
||||
val a: AtomicAction[Software] = new AtomicAction[Software]
|
||||
a.setClazz(classOf[Software])
|
||||
a.setPayload(software)
|
||||
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case orp: OtherResearchProduct =>
|
||||
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
|
||||
a.setClazz(classOf[OtherResearchProduct])
|
||||
a.setPayload(orp)
|
||||
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
|
||||
case relation: Relation =>
|
||||
val a: AtomicAction[Relation] = new AtomicAction[Relation]
|
||||
a.setClazz(classOf[Relation])
|
||||
a.setPayload(relation)
|
||||
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case _ =>
|
||||
null
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/save_actionset.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
|
||||
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $targetPath")
|
||||
|
||||
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||
|
||||
spark.read.load(sourcePath).as[Oaf]
|
||||
.map(o => toActionSet(o))
|
||||
.filter(o => o != null)
|
||||
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,608 +0,0 @@
|
|||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
||||
import java.nio.charset.CodingErrorAction
|
||||
import java.text.SimpleDateFormat
|
||||
import java.time.LocalDate
|
||||
import java.time.chrono.ThaiBuddhistDate
|
||||
import java.time.format.DateTimeFormatter
|
||||
import java.util.regex.Pattern
|
||||
import java.util.{Date, Locale}
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.{Codec, Source}
|
||||
import scala.language.postfixOps
|
||||
|
||||
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
|
||||
|
||||
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
|
||||
|
||||
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
|
||||
|
||||
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
|
||||
|
||||
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
|
||||
|
||||
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
|
||||
|
||||
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
|
||||
|
||||
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
|
||||
|
||||
case class DateType(date: Option[String], dateType: Option[String]) {}
|
||||
|
||||
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
|
||||
|
||||
object DataciteToOAFTransformation {
|
||||
|
||||
val REL_TYPE_VALUE:String = "resultResult"
|
||||
val DATE_RELATION_KEY = "RelationDate"
|
||||
|
||||
val subRelTypeMapping: Map[String,(String,String)] = Map(
|
||||
"References" ->("IsReferencedBy","relationship"),
|
||||
"IsSupplementTo" ->("IsSupplementedBy","supplement"),
|
||||
"IsPartOf" ->("HasPart","part"),
|
||||
"HasPart" ->("IsPartOf","part"),
|
||||
"IsVersionOf" ->("HasVersion","version"),
|
||||
"HasVersion" ->("IsVersionOf","version"),
|
||||
"IsIdenticalTo" ->("IsIdenticalTo","relationship"),
|
||||
"IsPreviousVersionOf" ->("IsNewVersionOf","version"),
|
||||
"IsContinuedBy" ->("Continues","relationship"),
|
||||
"Continues" ->("IsContinuedBy","relationship"),
|
||||
"IsNewVersionOf" ->("IsPreviousVersionOf","version"),
|
||||
"IsSupplementedBy" ->("IsSupplementTo","supplement"),
|
||||
"IsDocumentedBy" ->("Documents","relationship"),
|
||||
"IsSourceOf" ->("IsDerivedFrom","relationship"),
|
||||
"Cites" ->("IsCitedBy","citation"),
|
||||
"IsCitedBy" ->("Cites","citation"),
|
||||
"IsDerivedFrom" ->("IsSourceOf","relationship"),
|
||||
"IsVariantFormOf" ->("IsDerivedFrom","version"),
|
||||
"IsReferencedBy" ->("References","relationship"),
|
||||
"IsObsoletedBy" ->("IsNewVersionOf","version"),
|
||||
"Reviews" ->("IsReviewedBy","review"),
|
||||
"Documents" ->("IsDocumentedBy","relationship"),
|
||||
"IsCompiledBy" ->("Compiles","relationship"),
|
||||
"Compiles" ->("IsCompiledBy","relationship"),
|
||||
"IsReviewedBy" ->("Reviews","review")
|
||||
)
|
||||
|
||||
implicit val codec: Codec = Codec("UTF-8")
|
||||
codec.onMalformedInput(CodingErrorAction.REPLACE)
|
||||
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
|
||||
|
||||
val DOI_CLASS = "doi"
|
||||
val SUBJ_CLASS = "keywords"
|
||||
|
||||
|
||||
val j_filter: List[String] = {
|
||||
val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
|
||||
s.lines.toList
|
||||
}
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
val unknown_repository: HostedByMapType = HostedByMapType(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
|
||||
|
||||
val dataInfo: DataInfo = generateDataInfo("0.9")
|
||||
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite")
|
||||
|
||||
val hostedByMap: Map[String, HostedByMapType] = {
|
||||
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(s)
|
||||
json.extract[Map[String, HostedByMapType]]
|
||||
}
|
||||
|
||||
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
|
||||
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
|
||||
|
||||
val funder_regex: List[(Pattern, String)] = List(
|
||||
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
|
||||
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
|
||||
|
||||
)
|
||||
|
||||
val Date_regex: List[Pattern] = List(
|
||||
//Y-M-D
|
||||
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
|
||||
//M-D-Y
|
||||
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
|
||||
//D-M-Y
|
||||
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
|
||||
//Y
|
||||
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
|
||||
)
|
||||
|
||||
|
||||
def filter_json(json: String): Boolean = {
|
||||
j_filter.exists(f => json.contains(f))
|
||||
}
|
||||
|
||||
def toActionSet(item: Oaf): (String, String) = {
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
item match {
|
||||
case dataset: OafDataset =>
|
||||
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
|
||||
a.setClazz(classOf[OafDataset])
|
||||
a.setPayload(dataset)
|
||||
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case publication: Publication =>
|
||||
val a: AtomicAction[Publication] = new AtomicAction[Publication]
|
||||
a.setClazz(classOf[Publication])
|
||||
a.setPayload(publication)
|
||||
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case software: Software =>
|
||||
val a: AtomicAction[Software] = new AtomicAction[Software]
|
||||
a.setClazz(classOf[Software])
|
||||
a.setPayload(software)
|
||||
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case orp: OtherResearchProduct =>
|
||||
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
|
||||
a.setClazz(classOf[OtherResearchProduct])
|
||||
a.setPayload(orp)
|
||||
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
|
||||
case relation: Relation =>
|
||||
val a: AtomicAction[Relation] = new AtomicAction[Relation]
|
||||
a.setClazz(classOf[Relation])
|
||||
a.setPayload(relation)
|
||||
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case _ =>
|
||||
null
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
def embargo_end(embargo_end_date: String): Boolean = {
|
||||
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
|
||||
val td = LocalDate.now()
|
||||
td.isAfter(dt)
|
||||
}
|
||||
|
||||
|
||||
def extract_date(input: String): Option[String] = {
|
||||
val d = Date_regex.map(pattern => {
|
||||
val matcher = pattern.matcher(input)
|
||||
if (matcher.find())
|
||||
matcher.group(0)
|
||||
else
|
||||
null
|
||||
}
|
||||
).find(s => s != null)
|
||||
|
||||
if (d.isDefined) {
|
||||
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
|
||||
try {
|
||||
return Some(LocalDate.parse(a_date, df_en).toString)
|
||||
} catch {
|
||||
case _: Throwable => try {
|
||||
return Some(LocalDate.parse(a_date, df_it).toString)
|
||||
} catch {
|
||||
case _: Throwable =>
|
||||
return None
|
||||
}
|
||||
}
|
||||
}
|
||||
d
|
||||
}
|
||||
|
||||
def fix_thai_date(input:String, format:String) :String = {
|
||||
try {
|
||||
val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format))
|
||||
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
|
||||
LocalDate.from(d).toString
|
||||
} catch {
|
||||
case _: Throwable => ""
|
||||
}
|
||||
}
|
||||
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
|
||||
if (resourceType != null && resourceType.nonEmpty) {
|
||||
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
|
||||
if (typeQualifier != null)
|
||||
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
|
||||
}
|
||||
if (schemaOrg != null && schemaOrg.nonEmpty) {
|
||||
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
|
||||
if (typeQualifier != null)
|
||||
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
|
||||
|
||||
}
|
||||
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
|
||||
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
|
||||
if (typeQualifier != null)
|
||||
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
|
||||
|
||||
}
|
||||
null
|
||||
}
|
||||
|
||||
|
||||
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
|
||||
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||
if (typeQualifiers == null)
|
||||
return null
|
||||
val i = new Instance
|
||||
i.setInstancetype(typeQualifiers._1)
|
||||
typeQualifiers._2.getClassname match {
|
||||
case "dataset" =>
|
||||
val r = new OafDataset
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "publication" =>
|
||||
val r = new Publication
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "software" =>
|
||||
val r = new Software
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "other" =>
|
||||
val r = new OtherResearchProduct
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
}
|
||||
null
|
||||
}
|
||||
|
||||
|
||||
def available_date(input: String): Boolean = {
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(input)
|
||||
val l: List[String] = for {
|
||||
JObject(dates) <- json \\ "dates"
|
||||
JField("dateType", JString(dateTypes)) <- dates
|
||||
} yield dateTypes
|
||||
|
||||
l.exists(p => p.equalsIgnoreCase("available"))
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* As describe in ticket #6377
|
||||
* when the result come from figshare we need to remove subject
|
||||
* and set Access rights OPEN.
|
||||
*
|
||||
* @param r
|
||||
*/
|
||||
def fix_figshare(r: Result): Unit = {
|
||||
|
||||
if (r.getInstance() != null) {
|
||||
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
|
||||
if (hosted_by_figshare) {
|
||||
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
|
||||
val l: List[StructuredProperty] = List()
|
||||
r.setSubject(l.asJava)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
|
||||
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
|
||||
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
|
||||
}
|
||||
|
||||
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
|
||||
OafMapperUtils.structuredProperty(dt, q, null)
|
||||
}
|
||||
|
||||
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
|
||||
|
||||
val r = new Relation
|
||||
r.setSource(sourceId)
|
||||
r.setTarget(targetId)
|
||||
r.setRelType(ModelConstants.RESULT_PROJECT)
|
||||
r.setRelClass(relClass)
|
||||
r.setSubRelType(ModelConstants.OUTCOME)
|
||||
r.setCollectedfrom(List(cf).asJava)
|
||||
r.setDataInfo(di)
|
||||
r
|
||||
|
||||
|
||||
}
|
||||
|
||||
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
|
||||
val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())
|
||||
|
||||
if (match_pattern.isDefined) {
|
||||
val m = match_pattern.get._1
|
||||
val p = match_pattern.get._2
|
||||
val grantId = m.matcher(awardUri).replaceAll("$2")
|
||||
val targetId = s"$p${DHPUtils.md5(grantId)}"
|
||||
List(
|
||||
generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo)
|
||||
// REMOVED INVERSE RELATION since there is a specific method that should generate later
|
||||
// generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
|
||||
)
|
||||
}
|
||||
else
|
||||
List()
|
||||
|
||||
}
|
||||
|
||||
|
||||
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
|
||||
if (filter_json(input))
|
||||
return List()
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
||||
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
|
||||
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
|
||||
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
|
||||
|
||||
val doi = (json \ "attributes" \ "doi").extract[String]
|
||||
if (doi.isEmpty)
|
||||
return List()
|
||||
|
||||
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
|
||||
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||
if (result == null)
|
||||
return List()
|
||||
|
||||
|
||||
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
|
||||
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
|
||||
result.setPid(List(pid).asJava)
|
||||
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
|
||||
result.setOriginalId(List(doi).asJava)
|
||||
|
||||
val d = new Date(dateOfCollection * 1000)
|
||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||
|
||||
|
||||
result.setDateofcollection(ISO8601FORMAT.format(d))
|
||||
result.setDateoftransformation(ISO8601FORMAT.format(d))
|
||||
result.setDataInfo(dataInfo)
|
||||
|
||||
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
|
||||
|
||||
|
||||
val authors = creators.zipWithIndex.map { case (c, idx) =>
|
||||
val a = new Author
|
||||
a.setFullname(c.name.orNull)
|
||||
a.setName(c.givenName.orNull)
|
||||
a.setSurname(c.familyName.orNull)
|
||||
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
|
||||
a.setPid(c.nameIdentifiers.get.map(ni => {
|
||||
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
|
||||
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
|
||||
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
|
||||
}
|
||||
else
|
||||
null
|
||||
|
||||
}
|
||||
)
|
||||
.asJava)
|
||||
}
|
||||
if (c.affiliation.isDefined)
|
||||
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
|
||||
a.setRank(idx + 1)
|
||||
a
|
||||
}
|
||||
|
||||
|
||||
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
|
||||
|
||||
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
|
||||
if (t.titleType.isEmpty) {
|
||||
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
|
||||
} else {
|
||||
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
|
||||
}
|
||||
}).asJava)
|
||||
|
||||
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
|
||||
return List()
|
||||
result.setAuthor(authors.asJava)
|
||||
|
||||
val dates = (json \\ "dates").extract[List[DateType]]
|
||||
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
|
||||
|
||||
val i_date = dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
|
||||
.map(d => extract_date(d.date.get))
|
||||
val a_date: Option[String] = dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
|
||||
.map(d => extract_date(d.date.get))
|
||||
.find(d => d != null && d.isDefined)
|
||||
.map(d => d.get)
|
||||
|
||||
if (a_date.isDefined) {
|
||||
if(doi.startsWith("10.14457"))
|
||||
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null))
|
||||
else
|
||||
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
|
||||
}
|
||||
if (i_date.isDefined && i_date.get.isDefined) {
|
||||
if(doi.startsWith("10.14457")) {
|
||||
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
|
||||
}
|
||||
else {
|
||||
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
}
|
||||
}
|
||||
else if (publication_year != null) {
|
||||
if(doi.startsWith("10.14457")) {
|
||||
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
|
||||
|
||||
} else {
|
||||
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.map(d => (extract_date(d.date.get), d.dateType.get))
|
||||
.filter(d => d._1.isDefined)
|
||||
.map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
|
||||
.filter(d => d._2 != null)
|
||||
.map(d => generateOAFDate(d._1, d._2)).asJava)
|
||||
|
||||
val subjects = (json \\ "subjects").extract[List[SubjectType]]
|
||||
|
||||
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
|
||||
).asJava)
|
||||
|
||||
|
||||
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
|
||||
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
|
||||
|
||||
result.setDescription(
|
||||
descriptions
|
||||
.filter(d => d.description.isDefined).
|
||||
map(d =>
|
||||
OafMapperUtils.field(d.description.get, null)
|
||||
).filter(s => s != null).asJava)
|
||||
|
||||
|
||||
val publisher = (json \\ "publisher").extractOrElse[String](null)
|
||||
if (publisher != null)
|
||||
result.setPublisher(OafMapperUtils.field(publisher, null))
|
||||
|
||||
|
||||
val language: String = (json \\ "language").extractOrElse[String](null)
|
||||
|
||||
if (language != null)
|
||||
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
|
||||
|
||||
|
||||
val instance = result.getInstance().get(0)
|
||||
|
||||
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
|
||||
|
||||
val accessRights: List[String] = for {
|
||||
JObject(rightsList) <- json \\ "rightsList"
|
||||
JField("rightsUri", JString(rightsUri)) <- rightsList
|
||||
} yield rightsUri
|
||||
|
||||
val aRights: Option[AccessRight] = accessRights.map(r => {
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
|
||||
}).find(q => q != null).map(q => {
|
||||
val a = new AccessRight
|
||||
a.setClassid(q.getClassid)
|
||||
a.setClassname(q.getClassname)
|
||||
a.setSchemeid(q.getSchemeid)
|
||||
a.setSchemename(q.getSchemename)
|
||||
a
|
||||
})
|
||||
|
||||
|
||||
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
|
||||
|
||||
if (client.isDefined) {
|
||||
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
|
||||
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
|
||||
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
|
||||
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
|
||||
instance.setAccessright(access_rights_qualifier)
|
||||
instance.setPid(result.getPid)
|
||||
val license = accessRights
|
||||
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
|
||||
if (license.isDefined)
|
||||
instance.setLicense(OafMapperUtils.field(license.get, null))
|
||||
}
|
||||
|
||||
val awardUris: List[String] = for {
|
||||
JObject(fundingReferences) <- json \\ "fundingReferences"
|
||||
JField("awardUri", JString(awardUri)) <- fundingReferences
|
||||
} yield awardUri
|
||||
|
||||
result.setId(IdentifierFactory.createIdentifier(result))
|
||||
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
||||
|
||||
fix_figshare(result)
|
||||
|
||||
if (result.getId == null)
|
||||
return List()
|
||||
|
||||
if (exportLinks) {
|
||||
val rels: List[RelatedIdentifierType] = for {
|
||||
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
|
||||
JField("relationType", JString(relationType)) <- relIdentifier
|
||||
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
|
||||
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
|
||||
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
|
||||
|
||||
relations = relations ::: generateRelations(rels,result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
|
||||
}
|
||||
if (relations != null && relations.nonEmpty) {
|
||||
List(result) ::: relations
|
||||
}
|
||||
else
|
||||
List(result)
|
||||
}
|
||||
|
||||
private def generateRelations(rels: List[RelatedIdentifierType], id:String, date:String):List[Relation] = {
|
||||
rels
|
||||
.filter(r =>
|
||||
subRelTypeMapping.contains(r.relationType) && (
|
||||
r.relatedIdentifierType.equalsIgnoreCase("doi") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
|
||||
)
|
||||
.map(r => {
|
||||
val rel = new Relation
|
||||
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
rel.setDataInfo(dataInfo)
|
||||
|
||||
val subRelType = subRelTypeMapping(r.relationType)._2
|
||||
rel.setRelType(REL_TYPE_VALUE)
|
||||
rel.setSubRelType(subRelType)
|
||||
rel.setRelClass(r.relationType)
|
||||
|
||||
val dateProps:KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
|
||||
|
||||
rel.setProperties(List(dateProps).asJava)
|
||||
|
||||
rel.setSource(id)
|
||||
rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier,r.relatedIdentifierType))
|
||||
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
|
||||
rel
|
||||
}).toList
|
||||
}
|
||||
|
||||
def generateDataInfo(trust: String): DataInfo = {
|
||||
val di = new DataInfo
|
||||
di.setDeletedbyinference(false)
|
||||
di.setInferred(false)
|
||||
di.setInvisible(false)
|
||||
di.setTrust(trust)
|
||||
di.setProvenanceaction(ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER)
|
||||
di
|
||||
}
|
||||
|
||||
def generateDSId(input: String): String = {
|
||||
val b = StringUtils.substringBefore(input, "::")
|
||||
val a = StringUtils.substringAfter(input, "::")
|
||||
s"10|$b::${DHPUtils.md5(a)}"
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -1,64 +0,0 @@
|
|||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
|
||||
import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH
|
||||
import eu.dnetlib.dhp.common.Constants.MDSTORE_SIZE_PATH
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object GenerateDataciteDatasetSpark {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf = new SparkConf
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
|
||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||
log.info("isLookupUrl: {}", isLookupUrl)
|
||||
|
||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||
.appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
|
||||
|
||||
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
|
||||
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
||||
val mapper = new ObjectMapper()
|
||||
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
|
||||
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
|
||||
|
||||
log.info("outputBasePath: {}", outputBasePath)
|
||||
val targetPath = s"$outputBasePath/$MDSTORE_DATA_PATH"
|
||||
|
||||
spark.read.load(sourcePath).as[DataciteType]
|
||||
.filter(d => d.isActive)
|
||||
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
|
||||
.filter(d => d != null)
|
||||
.flatMap(i => fixRelations(i)).filter(i => i != null)
|
||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
|
||||
val total_items = spark.read.load(targetPath).as[Oaf].count()
|
||||
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
|
||||
}
|
||||
}
|
|
@ -1,394 +0,0 @@
|
|||
package eu.dnetlib.dhp.sx.bio
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.schema.oaf._
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||
import collection.JavaConverters._
|
||||
object BioDBToOAF {
|
||||
|
||||
case class EBILinkItem(id: Long, links: String) {}
|
||||
|
||||
case class EBILinks(relType: String, date: String, title: String, pmid: String, targetPid: String, targetPidType: String, targetUrl: String) {}
|
||||
|
||||
case class UniprotDate(date: String, date_info: String) {}
|
||||
|
||||
case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {}
|
||||
|
||||
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
|
||||
val SUBJ_CLASS = "Keywords"
|
||||
|
||||
val DATE_RELATION_KEY = "RelationDate"
|
||||
|
||||
val resolvedURL: Map[String, String] = Map(
|
||||
"genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
|
||||
"ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
|
||||
"clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
|
||||
"onim" -> "https://omim.org/entry/",
|
||||
"refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
|
||||
)
|
||||
|
||||
|
||||
val collectedFromMap: Map[String, KeyValue] = {
|
||||
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", "Protein Data Bank")
|
||||
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::c2a591f440598b63d854556beaf01591", "European Nucleotide Archive")
|
||||
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", "NCBI Nucleotide")
|
||||
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|re3data_____::296e1abaf1302897a6838d3588cd0310", "UniProtKB/Swiss-Prot")
|
||||
val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
|
||||
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
|
||||
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
|
||||
val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||
|
||||
UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
|
||||
PDBCollectedFrom.setDataInfo(DATA_INFO)
|
||||
ElsevierCollectedFrom.setDataInfo(DATA_INFO)
|
||||
EBICollectedFrom.setDataInfo(DATA_INFO)
|
||||
pubmedCollectedFrom.setDataInfo(DATA_INFO)
|
||||
enaCollectedFrom.setDataInfo(DATA_INFO)
|
||||
ncbiCollectedFrom.setDataInfo(DATA_INFO)
|
||||
springerNatureCollectedFrom.setDataInfo(DATA_INFO)
|
||||
|
||||
Map(
|
||||
"uniprot" -> UNIPROTCollectedFrom,
|
||||
"pdb" -> PDBCollectedFrom,
|
||||
"elsevier" -> ElsevierCollectedFrom,
|
||||
"ebi" -> EBICollectedFrom,
|
||||
"Springer Nature" -> springerNatureCollectedFrom,
|
||||
"NCBI Nucleotide" -> ncbiCollectedFrom,
|
||||
"European Nucleotide Archive" -> enaCollectedFrom,
|
||||
"Europe PMC" -> pubmedCollectedFrom
|
||||
)
|
||||
}
|
||||
|
||||
def crossrefLinksToOaf(input: String): Oaf = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase
|
||||
val source_pid_type = (json \ "Source" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
|
||||
|
||||
val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase
|
||||
val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
|
||||
|
||||
val relation_semantic = (json \ "RelationshipType" \ "Name").extract[String]
|
||||
|
||||
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
|
||||
|
||||
createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date)
|
||||
|
||||
}
|
||||
|
||||
|
||||
def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(input.pid.toLowerCase, input.pidType.toLowerCase, input.pidType.toLowerCase, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setDataInfo(DATA_INFO)
|
||||
|
||||
val nsPrefix = input.pidType.toLowerCase.padTo(12, '_')
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))
|
||||
|
||||
if (input.tilte != null && input.tilte.nonEmpty)
|
||||
d.setTitle(List(OafMapperUtils.structuredProperty(input.tilte.head, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
||||
|
||||
d.setOriginalId(List(input.pid).asJava)
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
|
||||
if (resolvedURL.contains(input.pidType)) {
|
||||
i.setUrl(List(s"${resolvedURL(input.pidType)}${input.pid}").asJava)
|
||||
}
|
||||
|
||||
if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
|
||||
i.setInstancetype(OafMapperUtils.qualifier("0037", "Clinical Trial", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
else
|
||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
|
||||
if (input.datasource == null || input.datasource.isEmpty)
|
||||
return null
|
||||
|
||||
val ds = input.datasource.head
|
||||
d.setCollectedfrom(List(collectedFromMap(ds)).asJava)
|
||||
i.setCollectedfrom(collectedFromMap(ds))
|
||||
d.setInstance(List(i).asJava)
|
||||
|
||||
if (input.authors != null && input.authors.nonEmpty) {
|
||||
val authors = input.authors.map(a => {
|
||||
val authorOAF = new Author
|
||||
authorOAF.setFullname(a)
|
||||
authorOAF
|
||||
})
|
||||
d.setAuthor(authors.asJava)
|
||||
}
|
||||
if (input.date != null && input.date.nonEmpty) {
|
||||
val dt = input.date.head
|
||||
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
|
||||
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
|
||||
}
|
||||
d
|
||||
}
|
||||
|
||||
|
||||
def uniprotToOAF(input: String): List[Oaf] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pid = (json \ "pid").extract[String]
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(pid, "uniprot", "uniprot", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setDataInfo(DATA_INFO)
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"uniprot_____::$pid", true))
|
||||
d.setCollectedfrom(List(collectedFromMap("uniprot")).asJava)
|
||||
|
||||
val title: String = (json \ "title").extractOrElse[String](null)
|
||||
|
||||
if (title != null)
|
||||
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
||||
|
||||
d.setOriginalId(List(pid).asJava)
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
|
||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("uniprot"))
|
||||
d.setInstance(List(i).asJava)
|
||||
|
||||
val dates: List[UniprotDate] = for {
|
||||
JObject(dateOBJ) <- json \ "dates"
|
||||
JField("date", JString(date)) <- dateOBJ
|
||||
JField("date_info", JString(date_info)) <- dateOBJ
|
||||
} yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info)
|
||||
|
||||
val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
|
||||
|
||||
|
||||
if (subjects != null) {
|
||||
d.setSubject(
|
||||
subjects.map(s =>
|
||||
OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
|
||||
).asJava)
|
||||
}
|
||||
var i_date: Option[UniprotDate] = None
|
||||
|
||||
if (dates.nonEmpty) {
|
||||
i_date = dates.find(d => d.date_info.contains("entry version"))
|
||||
if (i_date.isDefined) {
|
||||
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
}
|
||||
val relevant_dates: List[StructuredProperty] = dates.filter(d => !d.date_info.contains("entry version"))
|
||||
.map(date => OafMapperUtils.structuredProperty(date.date, ModelConstants.UNKNOWN, ModelConstants.UNKNOWN, ModelConstants.DNET_DATACITE_DATE, ModelConstants.DNET_DATACITE_DATE, DATA_INFO))
|
||||
if (relevant_dates != null && relevant_dates.nonEmpty)
|
||||
d.setRelevantdate(relevant_dates.asJava)
|
||||
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
}
|
||||
|
||||
|
||||
val references_pmid: List[String] = for {
|
||||
JObject(reference) <- json \ "references"
|
||||
JField("PubMed", JString(pid)) <- reference
|
||||
} yield pid
|
||||
|
||||
val references_doi: List[String] = for {
|
||||
JObject(reference) <- json \ "references"
|
||||
JField(" DOI", JString(pid)) <- reference
|
||||
} yield pid
|
||||
|
||||
|
||||
if (references_pmid != null && references_pmid.nonEmpty) {
|
||||
val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
|
||||
rel.getCollectedfrom
|
||||
List(d, rel)
|
||||
}
|
||||
else if (references_doi != null && references_doi.nonEmpty) {
|
||||
val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
|
||||
List(d, rel)
|
||||
}
|
||||
else
|
||||
List(d)
|
||||
}
|
||||
|
||||
|
||||
def generate_unresolved_id(pid: String, pidType: String): String = {
|
||||
s"unresolved::$pid::$pidType"
|
||||
}
|
||||
|
||||
|
||||
def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = {
|
||||
|
||||
val rel = new Relation
|
||||
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
|
||||
rel.setDataInfo(DATA_INFO)
|
||||
|
||||
rel.setRelType(ModelConstants.RESULT_RESULT)
|
||||
rel.setSubRelType(subRelType)
|
||||
rel.setRelClass(relClass)
|
||||
|
||||
rel.setSource(sourceId)
|
||||
rel.setTarget(s"unresolved::$pid::$pidType")
|
||||
|
||||
|
||||
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
|
||||
|
||||
rel.setProperties(List(dateProps).asJava)
|
||||
|
||||
rel.getTarget.startsWith("unresolved")
|
||||
rel.setCollectedfrom(List(collectedFrom).asJava)
|
||||
rel
|
||||
|
||||
}
|
||||
|
||||
|
||||
def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date: String): Relation = {
|
||||
createRelation(pid, pidType, sourceId, collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date)
|
||||
}
|
||||
|
||||
|
||||
def pdbTOOaf(input: String): List[Oaf] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pdb = (json \ "pdb").extract[String].toLowerCase
|
||||
|
||||
if (pdb.isEmpty)
|
||||
return List()
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(pdb, "pdb", "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
|
||||
d.setDataInfo(DATA_INFO)
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"pdb_________::$pdb", true))
|
||||
d.setOriginalId(List(pdb).asJava)
|
||||
|
||||
val title = (json \ "title").extractOrElse[String](null)
|
||||
|
||||
if (title == null)
|
||||
return List()
|
||||
d.setTitle(List(OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
||||
|
||||
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
|
||||
|
||||
if (authors != null) {
|
||||
val convertedAuthors = authors.zipWithIndex.map { a =>
|
||||
|
||||
val res = new Author
|
||||
res.setFullname(a._1)
|
||||
res.setRank(a._2 + 1)
|
||||
res
|
||||
}
|
||||
|
||||
d.setAuthor(convertedAuthors.asJava)
|
||||
}
|
||||
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
|
||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("pdb"))
|
||||
d.setInstance(List(i).asJava)
|
||||
|
||||
val pmid = (json \ "pmid").extractOrElse[String](null)
|
||||
|
||||
if (pmid != null)
|
||||
List(d, createSupplementaryRelation(pmid, "pmid", d.getId, collectedFromMap("pdb"), null))
|
||||
else
|
||||
List(d)
|
||||
}
|
||||
|
||||
|
||||
def extractEBILinksFromDump(input: String): EBILinkItem = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
||||
val pmid = (json \ "publication" \ "pmid").extract[String]
|
||||
val links = (json \ "links").extract[JObject]
|
||||
EBILinkItem(pmid.toLong, compact(render(links)))
|
||||
}
|
||||
|
||||
|
||||
def EBITargetLinksFilter(input: EBILinks): Boolean = {
|
||||
|
||||
input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase("pdb") || input.targetPidType.equalsIgnoreCase("uniprot")
|
||||
|
||||
}
|
||||
|
||||
|
||||
def parse_ebi_links(input: String): List[EBILinks] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pmid = (json \ "request" \ "id").extract[String]
|
||||
for {
|
||||
JObject(link) <- json \\ "Link"
|
||||
JField("Target", JObject(target)) <- link
|
||||
JField("RelationshipType", JObject(relType)) <- link
|
||||
JField("Name", JString(relation)) <- relType
|
||||
JField("PublicationDate", JString(publicationDate)) <- link
|
||||
JField("Title", JString(title)) <- target
|
||||
JField("Identifier", JObject(identifier)) <- target
|
||||
JField("IDScheme", JString(idScheme)) <- identifier
|
||||
JField("IDURL", JString(idUrl)) <- identifier
|
||||
JField("ID", JString(id)) <- identifier
|
||||
|
||||
} yield EBILinks(relation, GraphCleaningFunctions.cleanDate(publicationDate), title, pmid, id, idScheme, idUrl)
|
||||
}
|
||||
|
||||
|
||||
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
|
||||
val d = new Dataset
|
||||
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
|
||||
d.setDataInfo(DATA_INFO)
|
||||
d.setTitle(List(OafMapperUtils.structuredProperty(input.title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)).asJava)
|
||||
|
||||
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
|
||||
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
|
||||
d.setOriginalId(List(input.targetPid.toLowerCase).asJava)
|
||||
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(input.targetPid.toLowerCase, input.targetPidType.toLowerCase, "Protein Data Bank Identifier", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, DATA_INFO)
|
||||
).asJava
|
||||
)
|
||||
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(input.targetUrl).asJava)
|
||||
i.setInstancetype(OafMapperUtils.qualifier("0046", "Bioentity", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("ebi"))
|
||||
d.setInstance(List(i).asJava)
|
||||
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
|
||||
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO))
|
||||
|
||||
List(d, createRelation(input.pmid, "pmid", d.getId, collectedFromMap("ebi"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, GraphCleaningFunctions.cleanDate(input.date)))
|
||||
}
|
||||
}
|
|
@ -1,146 +0,0 @@
|
|||
package eu.dnetlib.dhp.sx.bio.pubmed
|
||||
|
||||
import scala.xml.MetaData
|
||||
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @param xml
|
||||
*/
|
||||
class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
||||
|
||||
var currentArticle:PMArticle = generateNextArticle()
|
||||
|
||||
override def hasNext: Boolean = currentArticle!= null
|
||||
|
||||
override def next(): PMArticle = {
|
||||
val tmp = currentArticle
|
||||
currentArticle = generateNextArticle()
|
||||
tmp
|
||||
}
|
||||
|
||||
def extractAttributes(attrs:MetaData, key:String):String = {
|
||||
|
||||
val res = attrs.get(key)
|
||||
if (res.isDefined) {
|
||||
val s =res.get
|
||||
if (s != null && s.nonEmpty)
|
||||
s.head.text
|
||||
else
|
||||
null
|
||||
}
|
||||
else null
|
||||
}
|
||||
|
||||
|
||||
def validate_Date(year:String, month:String, day:String):String = {
|
||||
try {
|
||||
f"${year.toInt}-${month.toInt}%02d-${day.toInt}%02d"
|
||||
|
||||
} catch {
|
||||
case _: Throwable =>null
|
||||
}
|
||||
}
|
||||
|
||||
def generateNextArticle():PMArticle = {
|
||||
|
||||
|
||||
var currentSubject:PMSubject = null
|
||||
var currentAuthor: PMAuthor = null
|
||||
var currentJournal: PMJournal = null
|
||||
var currentGrant: PMGrant = null
|
||||
var currNode: String = null
|
||||
var currentYear = "0"
|
||||
var currentMonth = "01"
|
||||
var currentDay = "01"
|
||||
var currentArticleType:String = null
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
while (xml.hasNext) {
|
||||
xml.next match {
|
||||
case EvElemStart(_, label, attrs, _) =>
|
||||
currNode = label
|
||||
|
||||
label match {
|
||||
case "PubmedArticle" => currentArticle = new PMArticle
|
||||
case "Author" => currentAuthor = new PMAuthor
|
||||
case "Journal" => currentJournal = new PMJournal
|
||||
case "Grant" => currentGrant = new PMGrant
|
||||
case "PublicationType" | "DescriptorName" =>
|
||||
currentSubject = new PMSubject
|
||||
currentSubject.setMeshId(extractAttributes(attrs, "UI"))
|
||||
case "ArticleId" => currentArticleType = extractAttributes(attrs,"IdType")
|
||||
case _ =>
|
||||
}
|
||||
case EvElemEnd(_, label) =>
|
||||
label match {
|
||||
case "PubmedArticle" => return currentArticle
|
||||
case "Author" => currentArticle.getAuthors.add(currentAuthor)
|
||||
case "Journal" => currentArticle.setJournal(currentJournal)
|
||||
case "Grant" => currentArticle.getGrants.add(currentGrant)
|
||||
case "PubMedPubDate" => if (currentArticle.getDate== null)
|
||||
currentArticle.setDate(validate_Date(currentYear,currentMonth,currentDay))
|
||||
case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
|
||||
case "DescriptorName" => currentArticle.getSubjects.add(currentSubject)
|
||||
case "PublicationType" =>currentArticle.getPublicationTypes.add(currentSubject)
|
||||
case _ =>
|
||||
}
|
||||
case EvText(text) =>
|
||||
if (currNode!= null && text.trim.nonEmpty)
|
||||
currNode match {
|
||||
case "ArticleTitle" => {
|
||||
if (currentArticle.getTitle==null)
|
||||
currentArticle.setTitle(text.trim)
|
||||
else
|
||||
currentArticle.setTitle(currentArticle.getTitle + text.trim)
|
||||
}
|
||||
case "AbstractText" => {
|
||||
if (currentArticle.getDescription==null)
|
||||
currentArticle.setDescription(text.trim)
|
||||
else
|
||||
currentArticle.setDescription(currentArticle.getDescription + text.trim)
|
||||
}
|
||||
case "PMID" => currentArticle.setPmid(text.trim)
|
||||
case "ArticleId" => if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
|
||||
case "Language" => currentArticle.setLanguage(text.trim)
|
||||
case "ISSN" => currentJournal.setIssn(text.trim)
|
||||
case "GrantID" => currentGrant.setGrantID(text.trim)
|
||||
case "Agency" => currentGrant.setAgency(text.trim)
|
||||
case "Country" => if (currentGrant != null) currentGrant.setCountry(text.trim)
|
||||
case "Year" => currentYear = text.trim
|
||||
case "Month" => currentMonth = text.trim
|
||||
case "Day" => currentDay = text.trim
|
||||
case "Volume" => currentJournal.setVolume( text.trim)
|
||||
case "Issue" => currentJournal.setIssue (text.trim)
|
||||
case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim)
|
||||
case "LastName" => {
|
||||
if (currentAuthor != null)
|
||||
currentAuthor.setLastName(text.trim)
|
||||
}
|
||||
case "ForeName" => if (currentAuthor != null)
|
||||
currentAuthor.setForeName(text.trim)
|
||||
case "Title" =>
|
||||
if (currentJournal.getTitle==null)
|
||||
currentJournal.setTitle(text.trim)
|
||||
else
|
||||
currentJournal.setTitle(currentJournal.getTitle + text.trim)
|
||||
case _ =>
|
||||
|
||||
}
|
||||
case _ =>
|
||||
}
|
||||
|
||||
}
|
||||
null
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "ip",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the URL from where to get the programme file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "rtn",
|
||||
"paramLongName": "resultTableName",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "bsp",
|
||||
"paramLongName": "bipScorePath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -27,4 +27,4 @@
|
|||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</configuration>
|
|
@ -1,9 +1,5 @@
|
|||
<workflow-app name="BipFinderScore" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>inputPath</name>
|
||||
<description>the input path of the resources to be extended</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>bipScorePath</name>
|
||||
|
@ -13,8 +9,61 @@
|
|||
<name>outputPath</name>
|
||||
<description>the path where to store the actionset</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
<start to="deleteoutputpath"/>
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
|
@ -30,14 +79,8 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="atomicactions">
|
||||
<path start="atomicactions_publication"/>
|
||||
<path start="atomicactions_dataset"/>
|
||||
<path start="atomicactions_orp"/>
|
||||
<path start="atomicactions_software"/>
|
||||
</fork>
|
||||
|
||||
<action name="atomicactions_publication">
|
||||
<action name="atomicactions">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
|
@ -54,113 +97,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/publication</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="atomicactions_dataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the bip finder scores for datasets</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="atomicactions_orp">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the bip finder scores for orp</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="atomicactions_software">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the bip finder scores for software</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/software</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/software</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="join_aa" to="collectandsave"/>
|
||||
|
||||
<action name="collectandsave">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>saves all the aa produced for the several types of results in the as output path</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.CollectAndSave</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}</arg>
|
||||
<arg>--inputPath</arg><arg>${bipScorePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -19,15 +19,9 @@
|
|||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "hnn",
|
||||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the path used to store the HostedByMap",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "cfn",
|
||||
"paramLongName": "classForName",
|
||||
"paramDescription": "the path used to store the HostedByMap",
|
||||
"paramRequired": true
|
||||
"paramName": "d",
|
||||
"paramLongName": "delimiter",
|
||||
"paramDescription": "the delimiter if different from the default one (,)",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -79,6 +79,7 @@
|
|||
<fork name="prepareInfo">
|
||||
<path start="prepareBip"/>
|
||||
<path start="getFOS"/>
|
||||
<path start="getSDG"/>
|
||||
</fork>
|
||||
|
||||
<action name="prepareBip">
|
||||
|
@ -106,17 +107,30 @@
|
|||
</action>
|
||||
|
||||
<action name="getFOS">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetFOSData</main-class>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Gets Data from FOS csv file</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetFOSSparkJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${fosPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/input/fos</arg>
|
||||
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel</arg>
|
||||
</java>
|
||||
</spark>
|
||||
<ok to="prepareFos"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="prepareFos">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
@ -142,6 +156,55 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="getSDG">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Gets Data from SDG csv file</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetSDGSparkJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sdgPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/input/sdg</arg>
|
||||
</spark>
|
||||
<ok to="prepareSDG"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="prepareSDG">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the unresolved from FOS!</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareSDGSparkJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/input/sdg</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/prepared</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="join"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<join name="join" to="produceUnresolved"/>
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
[
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the zipped opencitations file",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
||||
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "d",
|
||||
"paramLongName": "delimiter",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "op",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "if",
|
||||
"paramLongName": "inputFile",
|
||||
"paramDescription": "the hdfs name node",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
|
|
@ -26,6 +26,7 @@
|
|||
<switch>
|
||||
<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
|
||||
<case to="extract">${wf:conf('resumeFrom') eq 'ExtractContent'}</case>
|
||||
<case to="read">${wf:conf('resumeFrom') eq 'ReadContent'}</case>
|
||||
<default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
|
||||
</switch>
|
||||
</decision>
|
||||
|
@ -60,6 +61,32 @@
|
|||
<arg>--inputFile</arg><arg>${inputFile}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
</java>
|
||||
<ok to="read"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="read">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the AS for OC</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.opencitations.ReadCOCI</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}/COCI</arg>
|
||||
<arg>--outputPath</arg><arg>${workingPath}/COCI_JSON</arg>
|
||||
<arg>--delimiter</arg><arg>${delimiter}</arg>
|
||||
<arg>--inputFile</arg><arg>${inputFileCoci}</arg>
|
||||
</spark>
|
||||
<ok to="create_actionset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
@ -81,7 +108,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingPath}/COCI</arg>
|
||||
<arg>--inputPath</arg><arg>${workingPath}/COCI_JSON</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
from urllib.request import urlopen
|
||||
import json
|
||||
|
||||
|
||||
def retrieve_datacite_clients(base_url):
|
||||
datacite_clients = {}
|
||||
while base_url is not None:
|
||||
with urlopen(base_url) as response:
|
||||
print(f"requesting {base_url}")
|
||||
response_content = response.read()
|
||||
data = json.loads(response_content)
|
||||
if 'data' in data and len(data['data'])>0:
|
||||
for item in data['data']:
|
||||
datacite_clients[item['id'].lower()]= item['attributes']['re3data'].lower().replace("https://doi.org/","")
|
||||
base_url = data['links']['next']
|
||||
else:
|
||||
base_url = None
|
||||
return datacite_clients
|
||||
|
||||
|
||||
def retrieve_r3data(start_url):
|
||||
r3data_clients = {}
|
||||
page_number = 1
|
||||
base_url = start_url
|
||||
while base_url is not None:
|
||||
with urlopen(base_url) as response:
|
||||
print(f"requesting {base_url}")
|
||||
response_content = response.read()
|
||||
data = json.loads(response_content)
|
||||
if 'data' in data and len(data['data'])>0:
|
||||
for item in data['data']:
|
||||
r3data_clients[item['id'].lower()]= dict(
|
||||
openaire_id= "re3data_____::"+item['attributes']['re3dataId'].lower(),
|
||||
official_name=item['attributes']['repositoryName']
|
||||
)
|
||||
page_number +=1
|
||||
base_url = f"{start_url}&page[number]={page_number}"
|
||||
else:
|
||||
base_url = None
|
||||
return r3data_clients
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
base_url ="https://api.datacite.org/clients?query=re3data_id:*&page[size]=250"
|
||||
|
||||
dc = retrieve_datacite_clients(base_url)
|
||||
r3 = retrieve_r3data("https://api.datacite.org/re3data?page[size]=250")
|
||||
|
||||
result = {}
|
||||
|
||||
for item in dc:
|
||||
res = dc[item].lower()
|
||||
if res not in r3:
|
||||
print(f"missing {res} for {item} in dictionary")
|
||||
else:
|
||||
result[item.upper()]= dict(openaire_id=r3[res]["openaire_id"],datacite_name=r3[res]["official_name"], official_name=r3[res]["official_name"] )
|
||||
|
||||
|
||||
with open('hostedBy_map.json', 'w', encoding='utf8') as json_file:
|
||||
json.dump(result, json_file, ensure_ascii=False, indent=1)
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,19 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
|
@ -0,0 +1,62 @@
|
|||
<workflow-app name="Retrieve Scholix Update" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path of scholix graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datacitePath</name>
|
||||
<description>the datacite native path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingSupportPath</name>
|
||||
<description>the working Support path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>updateDS</name>
|
||||
<value>false</value>
|
||||
<description>The transformation Rule to apply</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="RetrieveDeltaDatacite"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="RetrieveDeltaDatacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>New Update from Datacite to Scholix</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkRetrieveDataciteDelta</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=6000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--datacitePath</arg><arg>${datacitePath}</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--workingSupportPath</arg><arg>${workingSupportPath}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--updateDS</arg><arg>${updateDS}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -0,0 +1,41 @@
|
|||
[
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source mdstore path",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
||||
{
|
||||
"paramName": "d",
|
||||
"paramLongName": "datacitePath",
|
||||
"paramDescription": "the datacite native path",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workingSupportPath",
|
||||
"paramDescription": "the working Support path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "the isLookup URL",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "m",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "the master name",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "u",
|
||||
"paramLongName": "updateDS",
|
||||
"paramDescription": "Need to regenerate all support Dataset",
|
||||
"paramRequired": false
|
||||
}
|
||||
|
||||
]
|
|
@ -7,3 +7,6 @@ log4j.appender.A1=org.apache.log4j.ConsoleAppender
|
|||
# A1 uses PatternLayout.
|
||||
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
|
||||
|
||||
log4j.logger.org.apache.spark=FATAL
|
||||
log4j.logger.org.spark_project=FATAL
|
||||
|
|
|
@ -1,20 +1,20 @@
|
|||
package eu.dnetlib.dhp.collection
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode}
|
||||
|
||||
object CollectionUtils {
|
||||
|
||||
/**
|
||||
* This method in pipeline to the transformation phase,
|
||||
* generates relations in both verse, typically it should be a phase of flatMap
|
||||
*
|
||||
* @param i input OAF
|
||||
* @return
|
||||
* If the input OAF is an entity -> List(i)
|
||||
* If the input OAF is a relation -> List(relation, inverseRelation)
|
||||
*
|
||||
*/
|
||||
/** This method in pipeline to the transformation phase,
|
||||
* generates relations in both verse, typically it should be a phase of flatMap
|
||||
*
|
||||
* @param i input OAF
|
||||
* @return
|
||||
* If the input OAF is an entity -> List(i)
|
||||
* If the input OAF is a relation -> List(relation, inverseRelation)
|
||||
*/
|
||||
|
||||
def fixRelations(i: Oaf): List[Oaf] = {
|
||||
if (i.isInstanceOf[OafEntity])
|
||||
|
@ -46,4 +46,18 @@ object CollectionUtils {
|
|||
List()
|
||||
}
|
||||
|
||||
def saveDataset(dataset: Dataset[Oaf], targetPath: String): Unit = {
|
||||
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
val mapper = new ObjectMapper
|
||||
|
||||
dataset
|
||||
.flatMap(i => CollectionUtils.fixRelations(i))
|
||||
.filter(i => i != null)
|
||||
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(targetPath)
|
||||
}
|
||||
|
||||
}
|
|
@ -6,7 +6,6 @@ import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
|
|||
import org.apache.http.entity.StringEntity
|
||||
import org.apache.http.impl.client.HttpClientBuilder
|
||||
|
||||
|
||||
abstract class AbstractRestClient extends Iterator[String] {
|
||||
|
||||
var buffer: List[String] = List()
|
||||
|
@ -16,12 +15,10 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
|
||||
var complete: Boolean = false
|
||||
|
||||
|
||||
def extractInfo(input: String): Unit
|
||||
|
||||
protected def getBufferData(): Unit
|
||||
|
||||
|
||||
def doHTTPGETRequest(url: String): String = {
|
||||
val httpGet = new HttpGet(url)
|
||||
doHTTPRequest(httpGet)
|
||||
|
@ -43,7 +40,6 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
buffer.nonEmpty && current_index < buffer.size
|
||||
}
|
||||
|
||||
|
||||
override def next(): String = {
|
||||
val next_item: String = buffer(current_index)
|
||||
current_index = current_index + 1
|
||||
|
@ -52,13 +48,14 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
next_item
|
||||
}
|
||||
|
||||
|
||||
private def doHTTPRequest[A <: HttpUriRequest](r: A): String = {
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
try {
|
||||
var tries = 4
|
||||
|
@ -69,8 +66,7 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
if (response.getStatusLine.getStatusCode > 400) {
|
||||
tries -= 1
|
||||
}
|
||||
else
|
||||
} else
|
||||
return IOUtils.toString(response.getEntity.getContent)
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
|
@ -87,4 +83,4 @@ abstract class AbstractRestClient extends Iterator[String] {
|
|||
}
|
||||
|
||||
getBufferData()
|
||||
}
|
||||
}
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.datacite
|
|||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||
import org.json4s.{DefaultFormats, JValue}
|
||||
|
||||
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
|
||||
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until: Long = -1) extends AbstractRestClient {
|
||||
|
||||
override def extractInfo(input: String): Unit = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
@ -16,16 +16,18 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -
|
|||
current_index = 0
|
||||
}
|
||||
|
||||
def get_url():String ={
|
||||
val to = if (until> 0) s"$until" else "*"
|
||||
def get_url(): String = {
|
||||
val to = if (until > 0) s"$until" else "*"
|
||||
s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]"
|
||||
|
||||
}
|
||||
|
||||
override def getBufferData(): Unit = {
|
||||
if (!complete) {
|
||||
val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
|
||||
val response =
|
||||
if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get)
|
||||
else doHTTPGETRequest(get_url())
|
||||
extractInfo(response)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,278 @@
|
|||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
|
||||
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
|
||||
|
||||
import java.io.InputStream
|
||||
import java.time.format.DateTimeFormatter
|
||||
import java.util.Locale
|
||||
import java.util.regex.Pattern
|
||||
import scala.io.Source
|
||||
|
||||
/** This class represent the dataModel of the input Dataset of Datacite
|
||||
* @param doi THE DOI
|
||||
* @param timestamp timestamp of last update date
|
||||
* @param isActive the record is active or deleted
|
||||
* @param json the json native records
|
||||
*/
|
||||
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
|
||||
|
||||
/*
|
||||
The following class are utility class used for the mapping from
|
||||
json datacite to OAF Shema
|
||||
*/
|
||||
case class RelatedIdentifierType(
|
||||
relationType: String,
|
||||
relatedIdentifier: String,
|
||||
relatedIdentifierType: String
|
||||
) {}
|
||||
|
||||
case class NameIdentifiersType(
|
||||
nameIdentifierScheme: Option[String],
|
||||
schemeUri: Option[String],
|
||||
nameIdentifier: Option[String]
|
||||
) {}
|
||||
|
||||
case class CreatorType(
|
||||
nameType: Option[String],
|
||||
nameIdentifiers: Option[List[NameIdentifiersType]],
|
||||
name: Option[String],
|
||||
familyName: Option[String],
|
||||
givenName: Option[String],
|
||||
affiliation: Option[List[String]]
|
||||
) {}
|
||||
|
||||
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
|
||||
|
||||
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
|
||||
|
||||
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
|
||||
|
||||
case class FundingReferenceType(
|
||||
funderIdentifierType: Option[String],
|
||||
awardTitle: Option[String],
|
||||
awardUri: Option[String],
|
||||
funderName: Option[String],
|
||||
funderIdentifier: Option[String],
|
||||
awardNumber: Option[String]
|
||||
) {}
|
||||
|
||||
case class DateType(date: Option[String], dateType: Option[String]) {}
|
||||
|
||||
case class OAFRelations(relation: String, inverse: String, relType: String)
|
||||
|
||||
class DataciteModelConstants extends Serializable {}
|
||||
|
||||
object DataciteModelConstants {
|
||||
|
||||
val REL_TYPE_VALUE: String = "resultResult"
|
||||
val DATE_RELATION_KEY = "RelationDate"
|
||||
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
|
||||
val DOI_CLASS = "doi"
|
||||
val SUBJ_CLASS = "keywords"
|
||||
val DATACITE_NAME = "Datacite"
|
||||
val dataInfo: DataInfo = dataciteDataInfo("0.9")
|
||||
|
||||
val DATACITE_COLLECTED_FROM: KeyValue =
|
||||
OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
|
||||
|
||||
val subRelTypeMapping: Map[String, OAFRelations] = Map(
|
||||
ModelConstants.REFERENCES -> OAFRelations(
|
||||
ModelConstants.REFERENCES,
|
||||
ModelConstants.IS_REFERENCED_BY,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_REFERENCED_BY -> OAFRelations(
|
||||
ModelConstants.IS_REFERENCED_BY,
|
||||
ModelConstants.REFERENCES,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(
|
||||
ModelConstants.IS_SUPPLEMENTED_BY,
|
||||
ModelConstants.IS_SUPPLEMENT_TO,
|
||||
ModelConstants.SUPPLEMENT
|
||||
),
|
||||
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(
|
||||
ModelConstants.IS_SUPPLEMENT_TO,
|
||||
ModelConstants.IS_SUPPLEMENTED_BY,
|
||||
ModelConstants.SUPPLEMENT
|
||||
),
|
||||
ModelConstants.HAS_PART -> OAFRelations(
|
||||
ModelConstants.HAS_PART,
|
||||
ModelConstants.IS_PART_OF,
|
||||
ModelConstants.PART
|
||||
),
|
||||
ModelConstants.IS_PART_OF -> OAFRelations(
|
||||
ModelConstants.IS_PART_OF,
|
||||
ModelConstants.HAS_PART,
|
||||
ModelConstants.PART
|
||||
),
|
||||
ModelConstants.IS_VERSION_OF -> OAFRelations(
|
||||
ModelConstants.IS_VERSION_OF,
|
||||
ModelConstants.HAS_VERSION,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.HAS_VERSION -> OAFRelations(
|
||||
ModelConstants.HAS_VERSION,
|
||||
ModelConstants.IS_VERSION_OF,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(
|
||||
ModelConstants.IS_IDENTICAL_TO,
|
||||
ModelConstants.IS_IDENTICAL_TO,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_CONTINUED_BY -> OAFRelations(
|
||||
ModelConstants.IS_CONTINUED_BY,
|
||||
ModelConstants.CONTINUES,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.CONTINUES -> OAFRelations(
|
||||
ModelConstants.CONTINUES,
|
||||
ModelConstants.IS_CONTINUED_BY,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_NEW_VERSION_OF -> OAFRelations(
|
||||
ModelConstants.IS_NEW_VERSION_OF,
|
||||
ModelConstants.IS_PREVIOUS_VERSION_OF,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations(
|
||||
ModelConstants.IS_PREVIOUS_VERSION_OF,
|
||||
ModelConstants.IS_NEW_VERSION_OF,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
|
||||
ModelConstants.IS_DOCUMENTED_BY,
|
||||
ModelConstants.DOCUMENTS,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.DOCUMENTS -> OAFRelations(
|
||||
ModelConstants.DOCUMENTS,
|
||||
ModelConstants.IS_DOCUMENTED_BY,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_SOURCE_OF -> OAFRelations(
|
||||
ModelConstants.IS_SOURCE_OF,
|
||||
ModelConstants.IS_DERIVED_FROM,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.IS_DERIVED_FROM -> OAFRelations(
|
||||
ModelConstants.IS_DERIVED_FROM,
|
||||
ModelConstants.IS_SOURCE_OF,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.CITES -> OAFRelations(
|
||||
ModelConstants.CITES,
|
||||
ModelConstants.IS_CITED_BY,
|
||||
ModelConstants.CITATION
|
||||
),
|
||||
ModelConstants.IS_CITED_BY -> OAFRelations(
|
||||
ModelConstants.IS_CITED_BY,
|
||||
ModelConstants.CITES,
|
||||
ModelConstants.CITATION
|
||||
),
|
||||
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(
|
||||
ModelConstants.IS_VARIANT_FORM_OF,
|
||||
ModelConstants.IS_DERIVED_FROM,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(
|
||||
ModelConstants.IS_OBSOLETED_BY,
|
||||
ModelConstants.IS_NEW_VERSION_OF,
|
||||
ModelConstants.VERSION
|
||||
),
|
||||
ModelConstants.REVIEWS -> OAFRelations(
|
||||
ModelConstants.REVIEWS,
|
||||
ModelConstants.IS_REVIEWED_BY,
|
||||
ModelConstants.REVIEW
|
||||
),
|
||||
ModelConstants.IS_REVIEWED_BY -> OAFRelations(
|
||||
ModelConstants.IS_REVIEWED_BY,
|
||||
ModelConstants.REVIEWS,
|
||||
ModelConstants.REVIEW
|
||||
),
|
||||
ModelConstants.DOCUMENTS -> OAFRelations(
|
||||
ModelConstants.DOCUMENTS,
|
||||
ModelConstants.IS_DOCUMENTED_BY,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(
|
||||
ModelConstants.IS_DOCUMENTED_BY,
|
||||
ModelConstants.DOCUMENTS,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.COMPILES -> OAFRelations(
|
||||
ModelConstants.COMPILES,
|
||||
ModelConstants.IS_COMPILED_BY,
|
||||
ModelConstants.RELATIONSHIP
|
||||
),
|
||||
ModelConstants.IS_COMPILED_BY -> OAFRelations(
|
||||
ModelConstants.IS_COMPILED_BY,
|
||||
ModelConstants.COMPILES,
|
||||
ModelConstants.RELATIONSHIP
|
||||
)
|
||||
)
|
||||
|
||||
val datacite_filter: List[String] = {
|
||||
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
|
||||
require(stream != null)
|
||||
Source.fromInputStream(stream).getLines().toList
|
||||
}
|
||||
|
||||
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
||||
trust
|
||||
)
|
||||
|
||||
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern(
|
||||
"[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]",
|
||||
Locale.ENGLISH
|
||||
)
|
||||
|
||||
val df_it: DateTimeFormatter =
|
||||
DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
|
||||
|
||||
val funder_regex: List[(Pattern, String)] = List(
|
||||
(
|
||||
Pattern.compile(
|
||||
"(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)",
|
||||
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
|
||||
),
|
||||
"40|corda__h2020::"
|
||||
),
|
||||
(
|
||||
Pattern.compile(
|
||||
"(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)",
|
||||
Pattern.MULTILINE | Pattern.CASE_INSENSITIVE
|
||||
),
|
||||
"40|corda_______::"
|
||||
)
|
||||
)
|
||||
|
||||
val Date_regex: List[Pattern] = List(
|
||||
//Y-M-D
|
||||
Pattern.compile(
|
||||
"(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])",
|
||||
Pattern.MULTILINE
|
||||
),
|
||||
//M-D-Y
|
||||
Pattern.compile(
|
||||
"((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d",
|
||||
Pattern.MULTILINE
|
||||
),
|
||||
//D-M-Y
|
||||
Pattern.compile(
|
||||
"(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})",
|
||||
Pattern.MULTILINE
|
||||
),
|
||||
//Y
|
||||
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
|
||||
)
|
||||
|
||||
}
|
|
@ -0,0 +1,652 @@
|
|||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.datacite.DataciteModelConstants._
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
||||
import java.text.SimpleDateFormat
|
||||
import java.time.LocalDate
|
||||
import java.time.chrono.ThaiBuddhistDate
|
||||
import java.time.format.DateTimeFormatter
|
||||
import java.util.{Date, Locale}
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.{Codec, Source}
|
||||
|
||||
object DataciteToOAFTransformation {
|
||||
|
||||
case class HostedByMapType(
|
||||
openaire_id: String,
|
||||
datacite_name: String,
|
||||
official_name: String,
|
||||
similarity: Option[Float]
|
||||
) {}
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
val unknown_repository: HostedByMapType = HostedByMapType(
|
||||
ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID,
|
||||
ModelConstants.UNKNOWN_REPOSITORY.getValue,
|
||||
ModelConstants.UNKNOWN_REPOSITORY.getValue,
|
||||
Some(1.0f)
|
||||
)
|
||||
|
||||
val hostedByMap: Map[String, HostedByMapType] = {
|
||||
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(s)
|
||||
json.extract[Map[String, HostedByMapType]]
|
||||
}
|
||||
|
||||
/** This method should skip record if json contains invalid text
|
||||
* defined in gile datacite_filter
|
||||
*
|
||||
* @param json
|
||||
* @return True if the record should be skipped
|
||||
*/
|
||||
def skip_record(json: String): Boolean = {
|
||||
datacite_filter.exists(f => json.contains(f))
|
||||
}
|
||||
|
||||
@deprecated("this method will be removed", "dhp")
|
||||
def toActionSet(item: Oaf): (String, String) = {
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
item match {
|
||||
case dataset: OafDataset =>
|
||||
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
|
||||
a.setClazz(classOf[OafDataset])
|
||||
a.setPayload(dataset)
|
||||
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case publication: Publication =>
|
||||
val a: AtomicAction[Publication] = new AtomicAction[Publication]
|
||||
a.setClazz(classOf[Publication])
|
||||
a.setPayload(publication)
|
||||
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case software: Software =>
|
||||
val a: AtomicAction[Software] = new AtomicAction[Software]
|
||||
a.setClazz(classOf[Software])
|
||||
a.setPayload(software)
|
||||
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case orp: OtherResearchProduct =>
|
||||
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
|
||||
a.setClazz(classOf[OtherResearchProduct])
|
||||
a.setPayload(orp)
|
||||
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
|
||||
case relation: Relation =>
|
||||
val a: AtomicAction[Relation] = new AtomicAction[Relation]
|
||||
a.setClazz(classOf[Relation])
|
||||
a.setPayload(relation)
|
||||
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case _ =>
|
||||
null
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
def embargo_end(embargo_end_date: String): Boolean = {
|
||||
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
|
||||
val td = LocalDate.now()
|
||||
td.isAfter(dt)
|
||||
}
|
||||
|
||||
def extract_date(input: String): Option[String] = {
|
||||
val d = Date_regex
|
||||
.map(pattern => {
|
||||
val matcher = pattern.matcher(input)
|
||||
if (matcher.find())
|
||||
matcher.group(0)
|
||||
else
|
||||
null
|
||||
})
|
||||
.find(s => s != null)
|
||||
|
||||
if (d.isDefined) {
|
||||
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
|
||||
try {
|
||||
return Some(LocalDate.parse(a_date, df_en).toString)
|
||||
} catch {
|
||||
case _: Throwable =>
|
||||
try {
|
||||
return Some(LocalDate.parse(a_date, df_it).toString)
|
||||
} catch {
|
||||
case _: Throwable =>
|
||||
return None
|
||||
}
|
||||
}
|
||||
}
|
||||
d
|
||||
}
|
||||
|
||||
def fix_thai_date(input: String, format: String): String = {
|
||||
try {
|
||||
val a_date = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
|
||||
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
|
||||
LocalDate.from(d).toString
|
||||
} catch {
|
||||
case _: Throwable => ""
|
||||
}
|
||||
}
|
||||
|
||||
def getTypeQualifier(
|
||||
resourceType: String,
|
||||
resourceTypeGeneral: String,
|
||||
schemaOrg: String,
|
||||
vocabularies: VocabularyGroup
|
||||
): (Qualifier, Qualifier) = {
|
||||
if (resourceType != null && resourceType.nonEmpty) {
|
||||
val typeQualifier =
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
|
||||
if (typeQualifier != null)
|
||||
return (
|
||||
typeQualifier,
|
||||
vocabularies.getSynonymAsQualifier(
|
||||
ModelConstants.DNET_RESULT_TYPOLOGIES,
|
||||
typeQualifier.getClassid
|
||||
)
|
||||
)
|
||||
}
|
||||
if (schemaOrg != null && schemaOrg.nonEmpty) {
|
||||
val typeQualifier =
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
|
||||
if (typeQualifier != null)
|
||||
return (
|
||||
typeQualifier,
|
||||
vocabularies.getSynonymAsQualifier(
|
||||
ModelConstants.DNET_RESULT_TYPOLOGIES,
|
||||
typeQualifier.getClassid
|
||||
)
|
||||
)
|
||||
|
||||
}
|
||||
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
|
||||
val typeQualifier = vocabularies.getSynonymAsQualifier(
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
resourceTypeGeneral
|
||||
)
|
||||
if (typeQualifier != null)
|
||||
return (
|
||||
typeQualifier,
|
||||
vocabularies.getSynonymAsQualifier(
|
||||
ModelConstants.DNET_RESULT_TYPOLOGIES,
|
||||
typeQualifier.getClassid
|
||||
)
|
||||
)
|
||||
|
||||
}
|
||||
null
|
||||
}
|
||||
|
||||
def getResult(
|
||||
resourceType: String,
|
||||
resourceTypeGeneral: String,
|
||||
schemaOrg: String,
|
||||
vocabularies: VocabularyGroup
|
||||
): Result = {
|
||||
val typeQualifiers: (Qualifier, Qualifier) =
|
||||
getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||
if (typeQualifiers == null)
|
||||
return null
|
||||
val i = new Instance
|
||||
i.setInstancetype(typeQualifiers._1)
|
||||
typeQualifiers._2.getClassname match {
|
||||
case "dataset" =>
|
||||
val r = new OafDataset
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "publication" =>
|
||||
val r = new Publication
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "software" =>
|
||||
val r = new Software
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
case "other" =>
|
||||
val r = new OtherResearchProduct
|
||||
r.setInstance(List(i).asJava)
|
||||
return r
|
||||
}
|
||||
null
|
||||
}
|
||||
|
||||
def available_date(input: String): Boolean = {
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(input)
|
||||
val l: List[String] = for {
|
||||
JObject(dates) <- json \\ "dates"
|
||||
JField("dateType", JString(dateTypes)) <- dates
|
||||
} yield dateTypes
|
||||
|
||||
l.exists(p => p.equalsIgnoreCase("available"))
|
||||
|
||||
}
|
||||
|
||||
/** As describe in ticket #6377
|
||||
* when the result come from figshare we need to remove subject
|
||||
* and set Access rights OPEN.
|
||||
*
|
||||
* @param r
|
||||
*/
|
||||
def fix_figshare(r: Result): Unit = {
|
||||
|
||||
if (r.getInstance() != null) {
|
||||
val hosted_by_figshare = r
|
||||
.getInstance()
|
||||
.asScala
|
||||
.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
|
||||
if (hosted_by_figshare) {
|
||||
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
|
||||
val l: List[StructuredProperty] = List()
|
||||
r.setSubject(l.asJava)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
|
||||
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
|
||||
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
|
||||
}
|
||||
|
||||
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
|
||||
OafMapperUtils.structuredProperty(dt, q, null)
|
||||
}
|
||||
|
||||
def generateRelation(
|
||||
sourceId: String,
|
||||
targetId: String,
|
||||
relClass: String,
|
||||
cf: KeyValue,
|
||||
di: DataInfo
|
||||
): Relation = {
|
||||
|
||||
val r = new Relation
|
||||
r.setSource(sourceId)
|
||||
r.setTarget(targetId)
|
||||
r.setRelType(ModelConstants.RESULT_PROJECT)
|
||||
r.setRelClass(relClass)
|
||||
r.setSubRelType(ModelConstants.OUTCOME)
|
||||
r.setCollectedfrom(List(cf).asJava)
|
||||
r.setDataInfo(di)
|
||||
r
|
||||
|
||||
}
|
||||
|
||||
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
|
||||
val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())
|
||||
|
||||
if (match_pattern.isDefined) {
|
||||
val m = match_pattern.get._1
|
||||
val p = match_pattern.get._2
|
||||
val grantId = m.matcher(awardUri).replaceAll("$2")
|
||||
val targetId = s"$p${DHPUtils.md5(grantId)}"
|
||||
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
|
||||
} else
|
||||
List()
|
||||
|
||||
}
|
||||
|
||||
def generateOAF(
|
||||
input: String,
|
||||
ts: Long,
|
||||
dateOfCollection: Long,
|
||||
vocabularies: VocabularyGroup,
|
||||
exportLinks: Boolean
|
||||
): List[Oaf] = {
|
||||
if (skip_record(input))
|
||||
return List()
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
||||
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
|
||||
val resourceTypeGeneral =
|
||||
(json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
|
||||
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
|
||||
|
||||
val doi = (json \ "attributes" \ "doi").extract[String]
|
||||
if (doi.isEmpty)
|
||||
return List()
|
||||
|
||||
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
|
||||
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
|
||||
if (result == null)
|
||||
return List()
|
||||
|
||||
val doi_q = OafMapperUtils.qualifier(
|
||||
"doi",
|
||||
"doi",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES
|
||||
)
|
||||
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
|
||||
result.setPid(List(pid).asJava)
|
||||
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
|
||||
result.setOriginalId(List(doi).asJava)
|
||||
|
||||
val d = new Date(dateOfCollection * 1000)
|
||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||
|
||||
result.setDateofcollection(ISO8601FORMAT.format(d))
|
||||
result.setDateoftransformation(ISO8601FORMAT.format(d))
|
||||
result.setDataInfo(dataInfo)
|
||||
|
||||
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
|
||||
|
||||
val authors = creators.zipWithIndex.map { case (c, idx) =>
|
||||
val a = new Author
|
||||
a.setFullname(c.name.orNull)
|
||||
a.setName(c.givenName.orNull)
|
||||
a.setSurname(c.familyName.orNull)
|
||||
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
|
||||
a.setPid(
|
||||
c.nameIdentifiers.get
|
||||
.map(ni => {
|
||||
val q =
|
||||
if (ni.nameIdentifierScheme.isDefined)
|
||||
vocabularies.getTermAsQualifier(
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ni.nameIdentifierScheme.get.toLowerCase()
|
||||
)
|
||||
else null
|
||||
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
|
||||
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
|
||||
} else
|
||||
null
|
||||
|
||||
})
|
||||
.asJava
|
||||
)
|
||||
}
|
||||
if (c.affiliation.isDefined)
|
||||
a.setAffiliation(
|
||||
c.affiliation.get
|
||||
.filter(af => af.nonEmpty)
|
||||
.map(af => OafMapperUtils.field(af, dataInfo))
|
||||
.asJava
|
||||
)
|
||||
a.setRank(idx + 1)
|
||||
a
|
||||
}
|
||||
|
||||
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
|
||||
|
||||
result.setTitle(
|
||||
titles
|
||||
.filter(t => t.title.nonEmpty)
|
||||
.map(t => {
|
||||
if (t.titleType.isEmpty) {
|
||||
OafMapperUtils
|
||||
.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
|
||||
} else {
|
||||
OafMapperUtils.structuredProperty(
|
||||
t.title.get,
|
||||
t.titleType.get,
|
||||
t.titleType.get,
|
||||
ModelConstants.DNET_DATACITE_TITLE,
|
||||
ModelConstants.DNET_DATACITE_TITLE,
|
||||
null
|
||||
)
|
||||
}
|
||||
})
|
||||
.asJava
|
||||
)
|
||||
|
||||
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
|
||||
return List()
|
||||
result.setAuthor(authors.asJava)
|
||||
|
||||
val dates = (json \\ "dates").extract[List[DateType]]
|
||||
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
|
||||
|
||||
val i_date = dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
|
||||
.map(d => extract_date(d.date.get))
|
||||
val a_date: Option[String] = dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
|
||||
.map(d => extract_date(d.date.get))
|
||||
.find(d => d != null && d.isDefined)
|
||||
.map(d => d.get)
|
||||
|
||||
if (a_date.isDefined) {
|
||||
if (doi.startsWith("10.14457"))
|
||||
result.setEmbargoenddate(
|
||||
OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null)
|
||||
)
|
||||
else
|
||||
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
|
||||
}
|
||||
if (i_date.isDefined && i_date.get.isDefined) {
|
||||
if (doi.startsWith("10.14457")) {
|
||||
result.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
|
||||
)
|
||||
result
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null)
|
||||
)
|
||||
} else {
|
||||
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
|
||||
}
|
||||
} else if (publication_year != null) {
|
||||
if (doi.startsWith("10.14457")) {
|
||||
result.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
|
||||
)
|
||||
result
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.setDateofacceptance(
|
||||
OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null)
|
||||
)
|
||||
|
||||
} else {
|
||||
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
result
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
|
||||
}
|
||||
}
|
||||
|
||||
result.setRelevantdate(
|
||||
dates
|
||||
.filter(d => d.date.isDefined && d.dateType.isDefined)
|
||||
.map(d => (extract_date(d.date.get), d.dateType.get))
|
||||
.filter(d => d._1.isDefined)
|
||||
.map(d =>
|
||||
(
|
||||
d._1.get,
|
||||
vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())
|
||||
)
|
||||
)
|
||||
.filter(d => d._2 != null)
|
||||
.map(d => generateOAFDate(d._1, d._2))
|
||||
.asJava
|
||||
)
|
||||
|
||||
val subjects = (json \\ "subjects").extract[List[SubjectType]]
|
||||
|
||||
result.setSubject(
|
||||
subjects
|
||||
.filter(s => s.subject.nonEmpty)
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
s.subject.get,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
null
|
||||
)
|
||||
)
|
||||
.asJava
|
||||
)
|
||||
|
||||
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
|
||||
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
|
||||
|
||||
result.setDescription(
|
||||
descriptions
|
||||
.filter(d => d.description.isDefined)
|
||||
.map(d => OafMapperUtils.field(d.description.get, null))
|
||||
.filter(s => s != null)
|
||||
.asJava
|
||||
)
|
||||
|
||||
val publisher = (json \\ "publisher").extractOrElse[String](null)
|
||||
if (publisher != null)
|
||||
result.setPublisher(OafMapperUtils.field(publisher, null))
|
||||
|
||||
val language: String = (json \\ "language").extractOrElse[String](null)
|
||||
|
||||
if (language != null)
|
||||
result.setLanguage(
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language)
|
||||
)
|
||||
|
||||
val instance = result.getInstance().get(0)
|
||||
|
||||
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
|
||||
|
||||
val accessRights: List[String] = for {
|
||||
JObject(rightsList) <- json \\ "rightsList"
|
||||
JField("rightsUri", JString(rightsUri)) <- rightsList
|
||||
} yield rightsUri
|
||||
|
||||
val aRights: Option[AccessRight] = accessRights
|
||||
.map(r => {
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
|
||||
})
|
||||
.find(q => q != null)
|
||||
.map(q => {
|
||||
val a = new AccessRight
|
||||
a.setClassid(q.getClassid)
|
||||
a.setClassname(q.getClassname)
|
||||
a.setSchemeid(q.getSchemeid)
|
||||
a.setSchemename(q.getSchemename)
|
||||
a
|
||||
})
|
||||
|
||||
val access_rights_qualifier =
|
||||
if (aRights.isDefined) aRights.get
|
||||
else
|
||||
OafMapperUtils.accessRight(
|
||||
ModelConstants.UNKNOWN,
|
||||
ModelConstants.NOT_AVAILABLE,
|
||||
ModelConstants.DNET_ACCESS_MODES,
|
||||
ModelConstants.DNET_ACCESS_MODES
|
||||
)
|
||||
|
||||
if (client.isDefined) {
|
||||
|
||||
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
|
||||
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
|
||||
|
||||
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
|
||||
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
|
||||
instance.setAccessright(access_rights_qualifier)
|
||||
instance.setPid(result.getPid)
|
||||
val license = accessRights
|
||||
.find(r =>
|
||||
r.startsWith("http") && r.matches(
|
||||
".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"
|
||||
)
|
||||
)
|
||||
if (license.isDefined)
|
||||
instance.setLicense(OafMapperUtils.field(license.get, null))
|
||||
}
|
||||
|
||||
val awardUris: List[String] = for {
|
||||
JObject(fundingReferences) <- json \\ "fundingReferences"
|
||||
JField("awardUri", JString(awardUri)) <- fundingReferences
|
||||
} yield awardUri
|
||||
|
||||
result.setId(IdentifierFactory.createIdentifier(result))
|
||||
var relations: List[Relation] =
|
||||
awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
|
||||
|
||||
fix_figshare(result)
|
||||
|
||||
if (result.getId == null)
|
||||
return List()
|
||||
|
||||
if (exportLinks) {
|
||||
val rels: List[RelatedIdentifierType] = for {
|
||||
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
|
||||
JField("relationType", JString(relationType)) <- relIdentifier
|
||||
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
|
||||
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
|
||||
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
|
||||
|
||||
relations = relations ::: generateRelations(
|
||||
rels,
|
||||
result.getId,
|
||||
if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null
|
||||
)
|
||||
}
|
||||
if (relations != null && relations.nonEmpty) {
|
||||
List(result) ::: relations
|
||||
} else
|
||||
List(result)
|
||||
}
|
||||
|
||||
private def generateRelations(
|
||||
rels: List[RelatedIdentifierType],
|
||||
id: String,
|
||||
date: String
|
||||
): List[Relation] = {
|
||||
rels
|
||||
.filter(r =>
|
||||
subRelTypeMapping
|
||||
.contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
|
||||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
|
||||
)
|
||||
.map(r => {
|
||||
val rel = new Relation
|
||||
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
rel.setDataInfo(dataInfo)
|
||||
|
||||
val subRelType = subRelTypeMapping(r.relationType).relType
|
||||
rel.setRelType(REL_TYPE_VALUE)
|
||||
rel.setSubRelType(subRelType)
|
||||
rel.setRelClass(r.relationType)
|
||||
|
||||
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
|
||||
|
||||
rel.setProperties(List(dateProps).asJava)
|
||||
|
||||
rel.setSource(id)
|
||||
rel.setTarget(
|
||||
DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType)
|
||||
)
|
||||
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
|
||||
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
|
||||
rel
|
||||
})
|
||||
}
|
||||
|
||||
def generateDSId(input: String): String = {
|
||||
val b = StringUtils.substringBefore(input, "::")
|
||||
val a = StringUtils.substringAfter(input, "::")
|
||||
s"10|$b::${DHPUtils.md5(a)}"
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,110 @@
|
|||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||
|
||||
/** Here all the spark applications runs this method
|
||||
* where the whole logic of the spark node is defined
|
||||
*/
|
||||
override def run(): Unit = {
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"SourcePath is '$sourcePath'")
|
||||
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
|
||||
log.info(s"exportLinks is '$exportLinks'")
|
||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||
log.info("isLookupUrl: {}", isLookupUrl)
|
||||
|
||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
require(vocabularies != null)
|
||||
|
||||
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
|
||||
log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
|
||||
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
|
||||
log.info(s"outputBasePath is '$outputBasePath'")
|
||||
val targetPath = s"$outputBasePath$MDSTORE_DATA_PATH"
|
||||
log.info(s"targetPath is '$targetPath'")
|
||||
|
||||
generateDataciteDataset(sourcePath, exportLinks, vocabularies, targetPath, spark)
|
||||
|
||||
reportTotalSize(targetPath, outputBasePath)
|
||||
}
|
||||
|
||||
/** For working with MDStore we need to store in a file on hdfs the size of
|
||||
* the current dataset
|
||||
* @param targetPath
|
||||
* @param outputBasePath
|
||||
*/
|
||||
def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
|
||||
val total_items = spark.read.text(targetPath).count()
|
||||
writeHdfsFile(
|
||||
spark.sparkContext.hadoopConfiguration,
|
||||
s"$total_items",
|
||||
outputBasePath + MDSTORE_SIZE_PATH
|
||||
)
|
||||
}
|
||||
|
||||
/** Generate the transformed and cleaned OAF Dataset from the native one
|
||||
*
|
||||
* @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
|
||||
* @param exportLinks If true it generates unresolved links
|
||||
* @param vocabularies vocabularies for cleaning
|
||||
* @param targetPath the targetPath of the result Dataset
|
||||
*/
|
||||
def generateDataciteDataset(
|
||||
sourcePath: String,
|
||||
exportLinks: Boolean,
|
||||
vocabularies: VocabularyGroup,
|
||||
targetPath: String,
|
||||
spark: SparkSession
|
||||
): Unit = {
|
||||
require(spark != null)
|
||||
import spark.implicits._
|
||||
|
||||
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
|
||||
|
||||
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
CollectionUtils.saveDataset(
|
||||
spark.read
|
||||
.load(sourcePath)
|
||||
.as[DataciteType]
|
||||
.filter(d => d.isActive)
|
||||
.flatMap(d =>
|
||||
DataciteToOAFTransformation
|
||||
.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)
|
||||
)
|
||||
.filter(d => d != null),
|
||||
targetPath
|
||||
)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
object GenerateDataciteDatasetSpark {
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
new GenerateDataciteDatasetSpark(
|
||||
"/eu/dnetlib/dhp/datacite/generate_dataset_params.json",
|
||||
args,
|
||||
log
|
||||
).initialize().run()
|
||||
}
|
||||
}
|
|
@ -22,7 +22,6 @@ object ImportDatacite {
|
|||
|
||||
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
|
||||
|
||||
|
||||
def convertAPIStringToDataciteItem(input: String): DataciteType = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: org.json4s.JValue = parse(input)
|
||||
|
@ -32,14 +31,26 @@ object ImportDatacite {
|
|||
|
||||
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
|
||||
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
|
||||
DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
|
||||
DataciteType(
|
||||
doi = doi,
|
||||
timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000,
|
||||
isActive = isActive,
|
||||
json = input
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json"
|
||||
)
|
||||
)
|
||||
.mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
|
||||
|
@ -60,7 +71,8 @@ object ImportDatacite {
|
|||
val spkipImport = parser.get("skipImport")
|
||||
log.info(s"skipImport is $spkipImport")
|
||||
|
||||
val spark: SparkSession = SparkSession.builder()
|
||||
val spark: SparkSession = SparkSession
|
||||
.builder()
|
||||
.appName(ImportDatacite.getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
@ -78,45 +90,48 @@ object ImportDatacite {
|
|||
|
||||
import spark.implicits._
|
||||
|
||||
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] =
|
||||
new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
|
||||
|
||||
val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
|
||||
override def zero: DataciteType = null
|
||||
|
||||
override def zero: DataciteType = null
|
||||
|
||||
override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
if (b == null)
|
||||
return a
|
||||
if (a == null)
|
||||
return b
|
||||
if (a.timestamp > b.timestamp) {
|
||||
return a
|
||||
override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
if (b == null)
|
||||
return a
|
||||
if (a == null)
|
||||
return b
|
||||
if (a.timestamp > b.timestamp) {
|
||||
return a
|
||||
}
|
||||
b
|
||||
}
|
||||
b
|
||||
|
||||
override def merge(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
reduce(a, b)
|
||||
}
|
||||
|
||||
override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def finish(reduction: DataciteType): DataciteType = reduction
|
||||
}
|
||||
|
||||
override def merge(a: DataciteType, b: DataciteType): DataciteType = {
|
||||
reduce(a, b)
|
||||
}
|
||||
|
||||
override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
|
||||
|
||||
override def finish(reduction: DataciteType): DataciteType = reduction
|
||||
}
|
||||
|
||||
val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
|
||||
val ts = dump.select(max("timestamp")).first().getLong(0)
|
||||
|
||||
println(s"last Timestamp is $ts")
|
||||
|
||||
val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
|
||||
val cnt =
|
||||
if ("true".equalsIgnoreCase(spkipImport)) 1
|
||||
else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
|
||||
|
||||
println(s"Imported from Datacite API $cnt documents")
|
||||
|
||||
if (cnt > 0) {
|
||||
|
||||
val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
|
||||
val inputRdd: RDD[DataciteType] = sc
|
||||
.sequenceFile(targetPath, classOf[Int], classOf[Text])
|
||||
.map(s => s._2.toString)
|
||||
.map(s => convertAPIStringToDataciteItem(s))
|
||||
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
|
||||
|
@ -129,7 +144,9 @@ object ImportDatacite {
|
|||
.agg(dataciteAggregator.toColumn)
|
||||
.map(s => s._2)
|
||||
.repartition(4000)
|
||||
.write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"${dataciteDump}_updated")
|
||||
|
||||
val fs = FileSystem.get(sc.hadoopConfiguration)
|
||||
fs.delete(new Path(s"$dataciteDump"), true)
|
||||
|
@ -137,14 +154,24 @@ object ImportDatacite {
|
|||
}
|
||||
}
|
||||
|
||||
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = {
|
||||
private def writeSequenceFile(
|
||||
hdfsTargetPath: Path,
|
||||
timestamp: Long,
|
||||
conf: Configuration,
|
||||
bs: Int
|
||||
): Long = {
|
||||
var from: Long = timestamp * 1000
|
||||
val delta: Long = 100000000L
|
||||
var client: DataciteAPIImporter = null
|
||||
val now: Long = System.currentTimeMillis()
|
||||
var i = 0
|
||||
try {
|
||||
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
|
||||
val writer = SequenceFile.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(hdfsTargetPath),
|
||||
SequenceFile.Writer.keyClass(classOf[IntWritable]),
|
||||
SequenceFile.Writer.valueClass(classOf[Text])
|
||||
)
|
||||
try {
|
||||
var start: Long = System.currentTimeMillis
|
||||
while (from < now) {
|
||||
|
@ -153,16 +180,16 @@ object ImportDatacite {
|
|||
val key: IntWritable = new IntWritable(i)
|
||||
val value: Text = new Text
|
||||
while (client.hasNext) {
|
||||
key.set({
|
||||
key.set {
|
||||
i += 1;
|
||||
i - 1
|
||||
})
|
||||
}
|
||||
value.set(client.next())
|
||||
writer.append(key, value)
|
||||
writer.hflush()
|
||||
if (i % 1000 == 0) {
|
||||
end = System.currentTimeMillis
|
||||
val time = (end - start) / 1000.0F
|
||||
val time = (end - start) / 1000.0f
|
||||
println(s"Imported $i in $time seconds")
|
||||
start = System.currentTimeMillis
|
||||
}
|
||||
|
@ -174,8 +201,7 @@ object ImportDatacite {
|
|||
case e: Throwable =>
|
||||
println("Error", e)
|
||||
} finally if (writer != null) writer.close()
|
||||
}
|
||||
catch {
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
log.error("Error", e)
|
||||
}
|
|
@ -17,7 +17,13 @@ object SparkDownloadUpdateDatacite {
|
|||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val conf = new SparkConf
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString)
|
||||
val parser = new ArgumentApplicationParser(
|
||||
Source
|
||||
.fromInputStream(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")
|
||||
)
|
||||
.mkString
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val master = parser.get("master")
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
|
@ -26,8 +32,9 @@ object SparkDownloadUpdateDatacite {
|
|||
val hdfsuri = parser.get("namenode")
|
||||
log.info(s"namenode is $hdfsuri")
|
||||
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().config(conf)
|
||||
val spark: SparkSession = SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(master)
|
||||
.getOrCreate()
|
||||
|
@ -37,13 +44,18 @@ object SparkDownloadUpdateDatacite {
|
|||
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
val maxDate: String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0)
|
||||
val maxDate: String = spark.read
|
||||
.load(workingPath)
|
||||
.as[Oaf]
|
||||
.filter(s => s.isInstanceOf[Result])
|
||||
.map(r => r.asInstanceOf[Result].getDateofcollection)
|
||||
.select(max("value"))
|
||||
.first()
|
||||
.getString(0)
|
||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||
val string_to_date = ISO8601FORMAT.parse(maxDate)
|
||||
val ts = string_to_date.getTime
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,597 @@
|
|||
package eu.dnetlib.dhp.sx.bio
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
|
||||
import eu.dnetlib.dhp.schema.oaf._
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.{compact, parse, render}
|
||||
import collection.JavaConverters._
|
||||
|
||||
object BioDBToOAF {
|
||||
|
||||
case class EBILinkItem(id: Long, links: String) {}
|
||||
|
||||
case class EBILinks(
|
||||
relType: String,
|
||||
date: String,
|
||||
title: String,
|
||||
pmid: String,
|
||||
targetPid: String,
|
||||
targetPidType: String,
|
||||
targetUrl: String
|
||||
) {}
|
||||
|
||||
case class UniprotDate(date: String, date_info: String) {}
|
||||
|
||||
case class ScholixResolved(
|
||||
pid: String,
|
||||
pidType: String,
|
||||
typology: String,
|
||||
tilte: List[String],
|
||||
datasource: List[String],
|
||||
date: List[String],
|
||||
authors: List[String]
|
||||
) {}
|
||||
|
||||
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
||||
"0.9"
|
||||
)
|
||||
val SUBJ_CLASS = "Keywords"
|
||||
|
||||
val DATE_RELATION_KEY = "RelationDate"
|
||||
|
||||
val resolvedURL: Map[String, String] = Map(
|
||||
"genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
|
||||
"ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
|
||||
"clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
|
||||
"onim" -> "https://omim.org/entry/",
|
||||
"refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
|
||||
"geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
|
||||
)
|
||||
|
||||
val collectedFromMap: Map[String, KeyValue] = {
|
||||
val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2",
|
||||
"Protein Data Bank"
|
||||
)
|
||||
val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|re3data_____::c2a591f440598b63d854556beaf01591",
|
||||
"European Nucleotide Archive"
|
||||
)
|
||||
val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6",
|
||||
"NCBI Nucleotide"
|
||||
)
|
||||
val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|re3data_____::296e1abaf1302897a6838d3588cd0310",
|
||||
"UniProtKB/Swiss-Prot"
|
||||
)
|
||||
val ElsevierCollectedFrom: KeyValue =
|
||||
OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
|
||||
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e",
|
||||
"Springer Nature"
|
||||
)
|
||||
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue(
|
||||
"10|opendoar____::83e60e09c222f206c725385f53d7e567c",
|
||||
"EMBL-EBIs Protein Data Bank in Europe (PDBe)"
|
||||
)
|
||||
val pubmedCollectedFrom: KeyValue =
|
||||
OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||
|
||||
UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
|
||||
PDBCollectedFrom.setDataInfo(DATA_INFO)
|
||||
ElsevierCollectedFrom.setDataInfo(DATA_INFO)
|
||||
EBICollectedFrom.setDataInfo(DATA_INFO)
|
||||
pubmedCollectedFrom.setDataInfo(DATA_INFO)
|
||||
enaCollectedFrom.setDataInfo(DATA_INFO)
|
||||
ncbiCollectedFrom.setDataInfo(DATA_INFO)
|
||||
springerNatureCollectedFrom.setDataInfo(DATA_INFO)
|
||||
|
||||
Map(
|
||||
"uniprot" -> UNIPROTCollectedFrom,
|
||||
"pdb" -> PDBCollectedFrom,
|
||||
"elsevier" -> ElsevierCollectedFrom,
|
||||
"ebi" -> EBICollectedFrom,
|
||||
"Springer Nature" -> springerNatureCollectedFrom,
|
||||
"NCBI Nucleotide" -> ncbiCollectedFrom,
|
||||
"European Nucleotide Archive" -> enaCollectedFrom,
|
||||
"Europe PMC" -> pubmedCollectedFrom
|
||||
)
|
||||
}
|
||||
|
||||
def crossrefLinksToOaf(input: String): Oaf = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase
|
||||
val source_pid_type = (json \ "Source" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
|
||||
|
||||
val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase
|
||||
val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
|
||||
|
||||
val relation_semantic = (json \ "RelationshipType" \ "Name").extract[String]
|
||||
|
||||
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
|
||||
|
||||
createRelation(
|
||||
target_pid,
|
||||
target_pid_type,
|
||||
generate_unresolved_id(source_pid, source_pid_type),
|
||||
collectedFromMap("elsevier"),
|
||||
"relationship",
|
||||
relation_semantic,
|
||||
date
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.pid.toLowerCase,
|
||||
input.pidType.toLowerCase,
|
||||
input.pidType.toLowerCase,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setDataInfo(DATA_INFO)
|
||||
|
||||
val nsPrefix = input.pidType.toLowerCase.padTo(12, '_')
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true))
|
||||
|
||||
if (input.tilte != null && input.tilte.nonEmpty)
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.tilte.head,
|
||||
ModelConstants.MAIN_TITLE_QUALIFIER,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setOriginalId(List(input.pid).asJava)
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
|
||||
if (resolvedURL.contains(input.pidType)) {
|
||||
i.setUrl(List(s"${resolvedURL(input.pidType)}${input.pid}").asJava)
|
||||
}
|
||||
|
||||
if (input.pidType.equalsIgnoreCase("clinicaltrials.gov"))
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0037",
|
||||
"Clinical Trial",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
else
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
if (input.datasource == null || input.datasource.isEmpty)
|
||||
return null
|
||||
|
||||
val ds = input.datasource.head
|
||||
d.setCollectedfrom(List(collectedFromMap(ds)).asJava)
|
||||
i.setCollectedfrom(collectedFromMap(ds))
|
||||
d.setInstance(List(i).asJava)
|
||||
|
||||
if (input.authors != null && input.authors.nonEmpty) {
|
||||
val authors = input.authors.map(a => {
|
||||
val authorOAF = new Author
|
||||
authorOAF.setFullname(a)
|
||||
authorOAF
|
||||
})
|
||||
d.setAuthor(authors.asJava)
|
||||
}
|
||||
if (input.date != null && input.date.nonEmpty) {
|
||||
val dt = input.date.head
|
||||
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
|
||||
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
|
||||
}
|
||||
d
|
||||
}
|
||||
|
||||
def uniprotToOAF(input: String): List[Oaf] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pid = (json \ "pid").extract[String]
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
pid,
|
||||
"uniprot",
|
||||
"uniprot",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setDataInfo(DATA_INFO)
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"uniprot_____::$pid", true))
|
||||
d.setCollectedfrom(List(collectedFromMap("uniprot")).asJava)
|
||||
|
||||
val title: String = (json \ "title").extractOrElse[String](null)
|
||||
|
||||
if (title != null)
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setOriginalId(List(pid).asJava)
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava)
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("uniprot"))
|
||||
d.setInstance(List(i).asJava)
|
||||
|
||||
val dates: List[UniprotDate] = for {
|
||||
JObject(dateOBJ) <- json \ "dates"
|
||||
JField("date", JString(date)) <- dateOBJ
|
||||
JField("date_info", JString(date_info)) <- dateOBJ
|
||||
} yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info)
|
||||
|
||||
val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null)
|
||||
|
||||
if (subjects != null) {
|
||||
d.setSubject(
|
||||
subjects
|
||||
.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
s,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
null
|
||||
)
|
||||
)
|
||||
.asJava
|
||||
)
|
||||
}
|
||||
var i_date: Option[UniprotDate] = None
|
||||
|
||||
if (dates.nonEmpty) {
|
||||
i_date = dates.find(d => d.date_info.contains("entry version"))
|
||||
if (i_date.isDefined) {
|
||||
i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
}
|
||||
val relevant_dates: List[StructuredProperty] = dates
|
||||
.filter(d => !d.date_info.contains("entry version"))
|
||||
.map(date =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
date.date,
|
||||
ModelConstants.UNKNOWN,
|
||||
ModelConstants.UNKNOWN,
|
||||
ModelConstants.DNET_DATACITE_DATE,
|
||||
ModelConstants.DNET_DATACITE_DATE,
|
||||
DATA_INFO
|
||||
)
|
||||
)
|
||||
if (relevant_dates != null && relevant_dates.nonEmpty)
|
||||
d.setRelevantdate(relevant_dates.asJava)
|
||||
d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO))
|
||||
}
|
||||
|
||||
val references_pmid: List[String] = for {
|
||||
JObject(reference) <- json \ "references"
|
||||
JField("PubMed", JString(pid)) <- reference
|
||||
} yield pid
|
||||
|
||||
val references_doi: List[String] = for {
|
||||
JObject(reference) <- json \ "references"
|
||||
JField(" DOI", JString(pid)) <- reference
|
||||
} yield pid
|
||||
|
||||
if (references_pmid != null && references_pmid.nonEmpty) {
|
||||
val rel = createRelation(
|
||||
references_pmid.head,
|
||||
"pmid",
|
||||
d.getId,
|
||||
collectedFromMap("uniprot"),
|
||||
ModelConstants.RELATIONSHIP,
|
||||
ModelConstants.IS_RELATED_TO,
|
||||
if (i_date.isDefined) i_date.get.date else null
|
||||
)
|
||||
rel.getCollectedfrom
|
||||
List(d, rel)
|
||||
} else if (references_doi != null && references_doi.nonEmpty) {
|
||||
val rel = createRelation(
|
||||
references_doi.head,
|
||||
"doi",
|
||||
d.getId,
|
||||
collectedFromMap("uniprot"),
|
||||
ModelConstants.RELATIONSHIP,
|
||||
ModelConstants.IS_RELATED_TO,
|
||||
if (i_date.isDefined) i_date.get.date else null
|
||||
)
|
||||
List(d, rel)
|
||||
} else
|
||||
List(d)
|
||||
}
|
||||
|
||||
def generate_unresolved_id(pid: String, pidType: String): String = {
|
||||
s"unresolved::$pid::$pidType"
|
||||
}
|
||||
|
||||
def createRelation(
|
||||
pid: String,
|
||||
pidType: String,
|
||||
sourceId: String,
|
||||
collectedFrom: KeyValue,
|
||||
subRelType: String,
|
||||
relClass: String,
|
||||
date: String
|
||||
): Relation = {
|
||||
|
||||
val rel = new Relation
|
||||
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
|
||||
rel.setDataInfo(DATA_INFO)
|
||||
|
||||
rel.setRelType(ModelConstants.RESULT_RESULT)
|
||||
rel.setSubRelType(subRelType)
|
||||
rel.setRelClass(relClass)
|
||||
|
||||
rel.setSource(sourceId)
|
||||
rel.setTarget(s"unresolved::$pid::$pidType")
|
||||
|
||||
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
|
||||
|
||||
rel.setProperties(List(dateProps).asJava)
|
||||
|
||||
rel.getTarget.startsWith("unresolved")
|
||||
rel.setCollectedfrom(List(collectedFrom).asJava)
|
||||
rel
|
||||
|
||||
}
|
||||
|
||||
def createSupplementaryRelation(
|
||||
pid: String,
|
||||
pidType: String,
|
||||
sourceId: String,
|
||||
collectedFrom: KeyValue,
|
||||
date: String
|
||||
): Relation = {
|
||||
createRelation(
|
||||
pid,
|
||||
pidType,
|
||||
sourceId,
|
||||
collectedFrom,
|
||||
ModelConstants.SUPPLEMENT,
|
||||
ModelConstants.IS_SUPPLEMENT_TO,
|
||||
date
|
||||
)
|
||||
}
|
||||
|
||||
def pdbTOOaf(input: String): List[Oaf] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pdb = (json \ "pdb").extract[String].toLowerCase
|
||||
|
||||
if (pdb.isEmpty)
|
||||
return List()
|
||||
|
||||
val d = new Dataset
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
pdb,
|
||||
"pdb",
|
||||
"Protein Data Bank Identifier",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
d.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
|
||||
d.setDataInfo(DATA_INFO)
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"pdb_________::$pdb", true))
|
||||
d.setOriginalId(List(pdb).asJava)
|
||||
|
||||
val title = (json \ "title").extractOrElse[String](null)
|
||||
|
||||
if (title == null)
|
||||
return List()
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO)
|
||||
).asJava
|
||||
)
|
||||
|
||||
val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null)
|
||||
|
||||
if (authors != null) {
|
||||
val convertedAuthors = authors.zipWithIndex.map { a =>
|
||||
val res = new Author
|
||||
res.setFullname(a._1)
|
||||
res.setRank(a._2 + 1)
|
||||
res
|
||||
}
|
||||
|
||||
d.setAuthor(convertedAuthors.asJava)
|
||||
}
|
||||
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava)
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("pdb"))
|
||||
d.setInstance(List(i).asJava)
|
||||
|
||||
val pmid = (json \ "pmid").extractOrElse[String](null)
|
||||
|
||||
if (pmid != null)
|
||||
List(d, createSupplementaryRelation(pmid, "pmid", d.getId, collectedFromMap("pdb"), null))
|
||||
else
|
||||
List(d)
|
||||
}
|
||||
|
||||
def extractEBILinksFromDump(input: String): EBILinkItem = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
|
||||
val pmid = (json \ "publication" \ "pmid").extract[String]
|
||||
val links = (json \ "links").extract[JObject]
|
||||
EBILinkItem(pmid.toLong, compact(render(links)))
|
||||
}
|
||||
|
||||
def EBITargetLinksFilter(input: EBILinks): Boolean = {
|
||||
|
||||
input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase(
|
||||
"pdb"
|
||||
) || input.targetPidType.equalsIgnoreCase("uniprot")
|
||||
|
||||
}
|
||||
|
||||
def parse_ebi_links(input: String): List[EBILinks] = {
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pmid = (json \ "request" \ "id").extract[String]
|
||||
for {
|
||||
JObject(link) <- json \\ "Link"
|
||||
JField("Target", JObject(target)) <- link
|
||||
JField("RelationshipType", JObject(relType)) <- link
|
||||
JField("Name", JString(relation)) <- relType
|
||||
JField("PublicationDate", JString(publicationDate)) <- link
|
||||
JField("Title", JString(title)) <- target
|
||||
JField("Identifier", JObject(identifier)) <- target
|
||||
JField("IDScheme", JString(idScheme)) <- identifier
|
||||
JField("IDURL", JString(idUrl)) <- identifier
|
||||
JField("ID", JString(id)) <- identifier
|
||||
|
||||
} yield EBILinks(
|
||||
relation,
|
||||
GraphCleaningFunctions.cleanDate(publicationDate),
|
||||
title,
|
||||
pmid,
|
||||
id,
|
||||
idScheme,
|
||||
idUrl
|
||||
)
|
||||
}
|
||||
|
||||
def convertEBILinksToOaf(input: EBILinks): List[Oaf] = {
|
||||
val d = new Dataset
|
||||
d.setCollectedfrom(List(collectedFromMap("ebi")).asJava)
|
||||
d.setDataInfo(DATA_INFO)
|
||||
d.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.title,
|
||||
ModelConstants.MAIN_TITLE_QUALIFIER,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_')
|
||||
|
||||
d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true))
|
||||
d.setOriginalId(List(input.targetPid.toLowerCase).asJava)
|
||||
|
||||
d.setPid(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
input.targetPid.toLowerCase,
|
||||
input.targetPidType.toLowerCase,
|
||||
"Protein Data Bank Identifier",
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
DATA_INFO
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
val i = new Instance
|
||||
|
||||
i.setPid(d.getPid)
|
||||
i.setUrl(List(input.targetUrl).asJava)
|
||||
i.setInstancetype(
|
||||
OafMapperUtils.qualifier(
|
||||
"0046",
|
||||
"Bioentity",
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||
ModelConstants.DNET_PUBLICATION_RESOURCE
|
||||
)
|
||||
)
|
||||
|
||||
i.setCollectedfrom(collectedFromMap("ebi"))
|
||||
d.setInstance(List(i).asJava)
|
||||
i.setDateofacceptance(
|
||||
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
|
||||
)
|
||||
d.setDateofacceptance(
|
||||
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO)
|
||||
)
|
||||
|
||||
List(
|
||||
d,
|
||||
createRelation(
|
||||
input.pmid,
|
||||
"pmid",
|
||||
d.getId,
|
||||
collectedFromMap("ebi"),
|
||||
ModelConstants.RELATIONSHIP,
|
||||
ModelConstants.IS_RELATED_TO,
|
||||
GraphCleaningFunctions.cleanDate(input.date)
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
|
@ -1,12 +1,12 @@
|
|||
package eu.dnetlib.dhp.sx.bio
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import BioDBToOAF.ScholixResolved
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkTransformBioDatabaseToOAF {
|
||||
|
@ -14,7 +14,11 @@ object SparkTransformBioDatabaseToOAF {
|
|||
def main(args: Array[String]): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val database: String = parser.get("database")
|
||||
log.info("database: {}", database)
|
||||
|
@ -29,20 +33,33 @@ object SparkTransformBioDatabaseToOAF {
|
|||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
val sc = spark.sparkContext
|
||||
|
||||
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
import spark.implicits._
|
||||
database.toUpperCase() match {
|
||||
case "UNIPROT" =>
|
||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
|
||||
targetPath
|
||||
)
|
||||
case "PDB" =>
|
||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
|
||||
targetPath
|
||||
)
|
||||
case "SCHOLIX" =>
|
||||
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
|
||||
targetPath
|
||||
)
|
||||
case "CROSSREF_LINKS" =>
|
||||
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
|
||||
targetPath
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
@ -1,9 +1,10 @@
|
|||
package eu.dnetlib.dhp.sx.bio.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.schema.oaf.Result
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed._
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
|
@ -23,31 +24,37 @@ import scala.xml.pull.XMLEventReader
|
|||
|
||||
object SparkCreateBaselineDataFrame {
|
||||
|
||||
|
||||
def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
|
||||
val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
|
||||
|
||||
val result = data.lines.filter(l => l.startsWith("<a href=")).map { l =>
|
||||
val end = l.lastIndexOf("\">")
|
||||
val start = l.indexOf("<a href=\"")
|
||||
val result = data.lines
|
||||
.filter(l => l.startsWith("<a href="))
|
||||
.map { l =>
|
||||
val end = l.lastIndexOf("\">")
|
||||
val start = l.indexOf("<a href=\"")
|
||||
|
||||
if (start >= 0 && end > start)
|
||||
l.substring(start + 9, end - start)
|
||||
else
|
||||
""
|
||||
}.filter(s => s.endsWith(".gz")).filter(s => s > maxFile).map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList
|
||||
if (start >= 0 && end > start)
|
||||
l.substring(start + 9, end - start)
|
||||
else
|
||||
""
|
||||
}
|
||||
.filter(s => s.endsWith(".gz"))
|
||||
.filter(s => s > maxFile)
|
||||
.map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s"))
|
||||
.toList
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
|
||||
def downloadBaselinePart(url: String): InputStream = {
|
||||
val r = new HttpGet(url)
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
val response = client.execute(r)
|
||||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
|
@ -58,10 +65,12 @@ object SparkCreateBaselineDataFrame {
|
|||
def requestPage(url: String): String = {
|
||||
val r = new HttpGet(url)
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
try {
|
||||
var tries = 4
|
||||
|
@ -72,8 +81,7 @@ object SparkCreateBaselineDataFrame {
|
|||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
if (response.getStatusLine.getStatusCode > 400) {
|
||||
tries -= 1
|
||||
}
|
||||
else
|
||||
} else
|
||||
return IOUtils.toString(response.getEntity.getContent)
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
|
@ -89,10 +97,8 @@ object SparkCreateBaselineDataFrame {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
|
||||
|
||||
|
||||
val conf = new Configuration
|
||||
conf.set("fs.defaultFS", hdfsServerUri)
|
||||
val fs = FileSystem.get(conf)
|
||||
|
@ -121,31 +127,36 @@ object SparkCreateBaselineDataFrame {
|
|||
|
||||
}
|
||||
|
||||
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] =
|
||||
new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
|
||||
override def zero: PMArticle = new PMArticle
|
||||
|
||||
val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] = new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
|
||||
override def zero: PMArticle = new PMArticle
|
||||
override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
|
||||
if (b != null && b.getPmid != null) b else a._2
|
||||
}
|
||||
|
||||
override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
|
||||
if (b != null && b.getPmid != null) b else a._2
|
||||
override def merge(b1: PMArticle, b2: PMArticle): PMArticle = {
|
||||
if (b1 != null && b1.getPmid != null) b1 else b2
|
||||
|
||||
}
|
||||
|
||||
override def finish(reduction: PMArticle): PMArticle = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
|
||||
|
||||
override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
|
||||
}
|
||||
|
||||
override def merge(b1: PMArticle, b2: PMArticle): PMArticle = {
|
||||
if (b1 != null && b1.getPmid != null) b1 else b2
|
||||
|
||||
}
|
||||
|
||||
override def finish(reduction: PMArticle): PMArticle = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
|
||||
|
||||
override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
|
||||
}
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
SparkEBILinksToOaf.getClass.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json"
|
||||
)
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||
log.info("isLookupUrl: {}", isLookupUrl)
|
||||
|
@ -161,7 +172,6 @@ object SparkCreateBaselineDataFrame {
|
|||
val skipUpdate = parser.get("skipUpdate")
|
||||
log.info("skipUpdate: {}", skipUpdate)
|
||||
|
||||
|
||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
val spark: SparkSession =
|
||||
|
@ -169,7 +179,8 @@ object SparkCreateBaselineDataFrame {
|
|||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
val sc = spark.sparkContext
|
||||
import spark.implicits._
|
||||
|
@ -177,24 +188,35 @@ object SparkCreateBaselineDataFrame {
|
|||
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
||||
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
||||
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
|
||||
implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
|
||||
if (!"true".equalsIgnoreCase(skipUpdate)) {
|
||||
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
|
||||
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
|
||||
val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
|
||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||
new PMParser(xml)
|
||||
}))
|
||||
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
|
||||
val ds: Dataset[PMArticle] = spark.createDataset(
|
||||
k.filter(i => i._1.endsWith(".gz"))
|
||||
.flatMap(i => {
|
||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||
new PMParser(xml)
|
||||
})
|
||||
)
|
||||
ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder))
|
||||
.groupByKey(_._1)
|
||||
.agg(pmArticleAggregator.toColumn)
|
||||
.map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
|
||||
.map(p => p._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/baseline_dataset")
|
||||
}
|
||||
|
||||
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
|
||||
exported_dataset
|
||||
.map(a => PubMedToOaf.convert(a, vocabularies)).as[Result]
|
||||
.filter(p => p != null)
|
||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
exported_dataset
|
||||
.map(a => PubMedToOaf.convert(a, vocabularies))
|
||||
.as[Oaf]
|
||||
.filter(p => p != null),
|
||||
targetPath
|
||||
)
|
||||
|
||||
}
|
||||
}
|
|
@ -1,9 +1,8 @@
|
|||
package eu.dnetlib.dhp.sx.bio.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.PMJournal
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.http.client.config.RequestConfig
|
||||
import org.apache.http.client.methods.HttpGet
|
||||
|
@ -26,10 +25,12 @@ object SparkDownloadEBILinks {
|
|||
def requestPage(url: String): String = {
|
||||
val r = new HttpGet(url)
|
||||
val timeout = 60; // seconds
|
||||
val config = RequestConfig.custom()
|
||||
val config = RequestConfig
|
||||
.custom()
|
||||
.setConnectTimeout(timeout * 1000)
|
||||
.setConnectionRequestTimeout(timeout * 1000)
|
||||
.setSocketTimeout(timeout * 1000).build()
|
||||
.setSocketTimeout(timeout * 1000)
|
||||
.build()
|
||||
val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
|
||||
try {
|
||||
var tries = 4
|
||||
|
@ -40,8 +41,7 @@ object SparkDownloadEBILinks {
|
|||
println(s"get response with status${response.getStatusLine.getStatusCode}")
|
||||
if (response.getStatusLine.getStatusCode > 400) {
|
||||
tries -= 1
|
||||
}
|
||||
else
|
||||
} else
|
||||
return IOUtils.toString(response.getEntity.getContent)
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
|
@ -67,14 +67,19 @@ object SparkDownloadEBILinks {
|
|||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val MAX_ITEM_PER_PARTITION = 20000
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
|
@ -88,22 +93,40 @@ object SparkDownloadEBILinks {
|
|||
log.info(s"workingPath -> $workingPath")
|
||||
|
||||
log.info("Getting max pubmedId where the links have already requested")
|
||||
val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
|
||||
val links: Dataset[EBILinkItem] =
|
||||
spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
|
||||
val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
|
||||
|
||||
log.info("Retrieving PMID to request links")
|
||||
val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
|
||||
pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
|
||||
pubmed
|
||||
.map(p => p.getPmid.toLong)
|
||||
.where(s"value > $lastPMIDRequested")
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/id_to_request")
|
||||
|
||||
val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
|
||||
|
||||
val total = pmidToReq.count()
|
||||
|
||||
spark.createDataset(pmidToReq.rdd.repartition((total / MAX_ITEM_PER_PARTITION).toInt).map(pmid => createEBILinks(pmid)).filter(l => l != null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
|
||||
spark
|
||||
.createDataset(
|
||||
pmidToReq.rdd
|
||||
.repartition((total / MAX_ITEM_PER_PARTITION).toInt)
|
||||
.map(pmid => createEBILinks(pmid))
|
||||
.filter(l => l != null)
|
||||
)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/links_update")
|
||||
|
||||
val updates: Dataset[EBILinkItem] = spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
|
||||
val updates: Dataset[EBILinkItem] =
|
||||
spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
|
||||
|
||||
links.union(updates).groupByKey(_.id)
|
||||
links
|
||||
.union(updates)
|
||||
.groupByKey(_.id)
|
||||
.reduceGroups { (x, y) =>
|
||||
if (x == null || x.links == null)
|
||||
y
|
||||
|
@ -113,6 +136,10 @@ object SparkDownloadEBILinks {
|
|||
x
|
||||
else
|
||||
y
|
||||
}.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final")
|
||||
}
|
||||
.map(_._2)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/links_final")
|
||||
}
|
||||
}
|
|
@ -1,11 +1,10 @@
|
|||
package eu.dnetlib.dhp.sx.bio.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
|
||||
import BioDBToOAF.EBILinkItem
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql._
|
||||
|
@ -16,15 +15,19 @@ object SparkEBILinksToOaf {
|
|||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")))
|
||||
val parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(
|
||||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")
|
||||
)
|
||||
)
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkEBILinksToOaf.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate()
|
||||
|
||||
import spark.implicits._
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
|
@ -33,12 +36,17 @@ object SparkEBILinksToOaf {
|
|||
log.info(s"targetPath -> $targetPath")
|
||||
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
|
||||
val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))
|
||||
val ebLinks: Dataset[EBILinkItem] = spark.read
|
||||
.load(sourcePath)
|
||||
.as[EBILinkItem]
|
||||
.filter(l => l.links != null && l.links.startsWith("{"))
|
||||
|
||||
ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
||||
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
||||
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p))
|
||||
.flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null)
|
||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
CollectionUtils.saveDataset(
|
||||
ebLinks
|
||||
.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
|
||||
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
|
||||
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)),
|
||||
targetPath
|
||||
)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,133 @@
|
|||
package eu.dnetlib.dhp.sx.bio.pubmed
|
||||
|
||||
import scala.xml.MetaData
|
||||
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
||||
|
||||
/** @param xml
|
||||
*/
|
||||
class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] {
|
||||
|
||||
var currentArticle: PMArticle = generateNextArticle()
|
||||
|
||||
override def hasNext: Boolean = currentArticle != null
|
||||
|
||||
override def next(): PMArticle = {
|
||||
val tmp = currentArticle
|
||||
currentArticle = generateNextArticle()
|
||||
tmp
|
||||
}
|
||||
|
||||
def extractAttributes(attrs: MetaData, key: String): String = {
|
||||
|
||||
val res = attrs.get(key)
|
||||
if (res.isDefined) {
|
||||
val s = res.get
|
||||
if (s != null && s.nonEmpty)
|
||||
s.head.text
|
||||
else
|
||||
null
|
||||
} else null
|
||||
}
|
||||
|
||||
def validate_Date(year: String, month: String, day: String): String = {
|
||||
try {
|
||||
f"${year.toInt}-${month.toInt}%02d-${day.toInt}%02d"
|
||||
|
||||
} catch {
|
||||
case _: Throwable => null
|
||||
}
|
||||
}
|
||||
|
||||
def generateNextArticle(): PMArticle = {
|
||||
|
||||
var currentSubject: PMSubject = null
|
||||
var currentAuthor: PMAuthor = null
|
||||
var currentJournal: PMJournal = null
|
||||
var currentGrant: PMGrant = null
|
||||
var currNode: String = null
|
||||
var currentYear = "0"
|
||||
var currentMonth = "01"
|
||||
var currentDay = "01"
|
||||
var currentArticleType: String = null
|
||||
|
||||
while (xml.hasNext) {
|
||||
xml.next match {
|
||||
case EvElemStart(_, label, attrs, _) =>
|
||||
currNode = label
|
||||
|
||||
label match {
|
||||
case "PubmedArticle" => currentArticle = new PMArticle
|
||||
case "Author" => currentAuthor = new PMAuthor
|
||||
case "Journal" => currentJournal = new PMJournal
|
||||
case "Grant" => currentGrant = new PMGrant
|
||||
case "PublicationType" | "DescriptorName" =>
|
||||
currentSubject = new PMSubject
|
||||
currentSubject.setMeshId(extractAttributes(attrs, "UI"))
|
||||
case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType")
|
||||
case _ =>
|
||||
}
|
||||
case EvElemEnd(_, label) =>
|
||||
label match {
|
||||
case "PubmedArticle" => return currentArticle
|
||||
case "Author" => currentArticle.getAuthors.add(currentAuthor)
|
||||
case "Journal" => currentArticle.setJournal(currentJournal)
|
||||
case "Grant" => currentArticle.getGrants.add(currentGrant)
|
||||
case "PubMedPubDate" =>
|
||||
if (currentArticle.getDate == null)
|
||||
currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay))
|
||||
case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
|
||||
case "DescriptorName" => currentArticle.getSubjects.add(currentSubject)
|
||||
case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject)
|
||||
case _ =>
|
||||
}
|
||||
case EvText(text) =>
|
||||
if (currNode != null && text.trim.nonEmpty)
|
||||
currNode match {
|
||||
case "ArticleTitle" => {
|
||||
if (currentArticle.getTitle == null)
|
||||
currentArticle.setTitle(text.trim)
|
||||
else
|
||||
currentArticle.setTitle(currentArticle.getTitle + text.trim)
|
||||
}
|
||||
case "AbstractText" => {
|
||||
if (currentArticle.getDescription == null)
|
||||
currentArticle.setDescription(text.trim)
|
||||
else
|
||||
currentArticle.setDescription(currentArticle.getDescription + text.trim)
|
||||
}
|
||||
case "PMID" => currentArticle.setPmid(text.trim)
|
||||
case "ArticleId" =>
|
||||
if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim)
|
||||
case "Language" => currentArticle.setLanguage(text.trim)
|
||||
case "ISSN" => currentJournal.setIssn(text.trim)
|
||||
case "GrantID" => currentGrant.setGrantID(text.trim)
|
||||
case "Agency" => currentGrant.setAgency(text.trim)
|
||||
case "Country" => if (currentGrant != null) currentGrant.setCountry(text.trim)
|
||||
case "Year" => currentYear = text.trim
|
||||
case "Month" => currentMonth = text.trim
|
||||
case "Day" => currentDay = text.trim
|
||||
case "Volume" => currentJournal.setVolume(text.trim)
|
||||
case "Issue" => currentJournal.setIssue(text.trim)
|
||||
case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim)
|
||||
case "LastName" => {
|
||||
if (currentAuthor != null)
|
||||
currentAuthor.setLastName(text.trim)
|
||||
}
|
||||
case "ForeName" =>
|
||||
if (currentAuthor != null)
|
||||
currentAuthor.setForeName(text.trim)
|
||||
case "Title" =>
|
||||
if (currentJournal.getTitle == null)
|
||||
currentJournal.setTitle(text.trim)
|
||||
else
|
||||
currentJournal.setTitle(currentJournal.getTitle + text.trim)
|
||||
case _ =>
|
||||
|
||||
}
|
||||
case _ =>
|
||||
}
|
||||
|
||||
}
|
||||
null
|
||||
}
|
||||
}
|
|
@ -4,36 +4,43 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
|||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
|
||||
import eu.dnetlib.dhp.schema.oaf._
|
||||
import scala.collection.JavaConverters._
|
||||
import collection.JavaConverters._
|
||||
|
||||
import java.util.regex.Pattern
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
*/
|
||||
object PubMedToOaf {
|
||||
|
||||
val SUBJ_CLASS = "keywords"
|
||||
|
||||
val urlMap = Map(
|
||||
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
|
||||
"doi" -> "https://dx.doi.org/"
|
||||
"doi" -> "https://dx.doi.org/"
|
||||
)
|
||||
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
|
||||
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||
|
||||
val dataInfo: DataInfo = OafMapperUtils.dataInfo(
|
||||
false,
|
||||
null,
|
||||
false,
|
||||
false,
|
||||
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
|
||||
"0.9"
|
||||
)
|
||||
|
||||
val collectedFrom: KeyValue =
|
||||
OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
|
||||
|
||||
/**
|
||||
* Cleaning the DOI Applying regex in order to
|
||||
* remove doi starting with URL
|
||||
* @param doi input DOI
|
||||
* @return cleaned DOI
|
||||
*/
|
||||
/** Cleaning the DOI Applying regex in order to
|
||||
* remove doi starting with URL
|
||||
*
|
||||
* @param doi input DOI
|
||||
* @return cleaned DOI
|
||||
*/
|
||||
def cleanDoi(doi: String): String = {
|
||||
|
||||
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
|
||||
|
||||
|
||||
val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
|
||||
val matcher = pattern.matcher(doi)
|
||||
|
||||
|
@ -43,33 +50,34 @@ object PubMedToOaf {
|
|||
null
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Create an instance of class extends Result
|
||||
* starting from OAF instanceType value
|
||||
*
|
||||
* @param cobjQualifier OAF instance type
|
||||
* @param vocabularies All dnet vocabularies
|
||||
* @return the correct instance
|
||||
*/
|
||||
/** Create an instance of class extends Result
|
||||
* starting from OAF instanceType value
|
||||
*
|
||||
* @param cobjQualifier OAF instance type
|
||||
* @param vocabularies All dnet vocabularies
|
||||
* @return the correct instance
|
||||
*/
|
||||
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
|
||||
val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid)
|
||||
val result_typologies = getVocabularyTerm(
|
||||
ModelConstants.DNET_RESULT_TYPOLOGIES,
|
||||
vocabularies,
|
||||
cobjQualifier.getClassid
|
||||
)
|
||||
result_typologies.getClassid match {
|
||||
case "dataset" => new Dataset
|
||||
case "dataset" => new Dataset
|
||||
case "publication" => new Publication
|
||||
case "other" => new OtherResearchProduct
|
||||
case "software" => new Software
|
||||
case _ => null
|
||||
case "other" => new OtherResearchProduct
|
||||
case "software" => new Software
|
||||
case _ => null
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Mapping the Pubmedjournal info into the OAF Journale
|
||||
*
|
||||
* @param j the pubmedJournal
|
||||
* @return the OAF Journal
|
||||
*/
|
||||
/** Mapping the Pubmedjournal info into the OAF Journale
|
||||
*
|
||||
* @param j the pubmedJournal
|
||||
* @return the OAF Journal
|
||||
*/
|
||||
def mapJournal(j: PMJournal): Journal = {
|
||||
if (j == null)
|
||||
return null
|
||||
|
@ -83,42 +91,47 @@ object PubMedToOaf {
|
|||
journal.setIss(j.getIssue)
|
||||
journal
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Find vocabulary term into synonyms and term in the vocabulary
|
||||
*
|
||||
* @param vocabularyName the input vocabulary name
|
||||
* @param vocabularies all the vocabularies
|
||||
* @param term the term to find
|
||||
*
|
||||
* @return the cleaned term value
|
||||
*/
|
||||
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
|
||||
/** Find vocabulary term into synonyms and term in the vocabulary
|
||||
*
|
||||
* @param vocabularyName the input vocabulary name
|
||||
* @param vocabularies all the vocabularies
|
||||
* @param term the term to find
|
||||
* @return the cleaned term value
|
||||
*/
|
||||
def getVocabularyTerm(
|
||||
vocabularyName: String,
|
||||
vocabularies: VocabularyGroup,
|
||||
term: String
|
||||
): Qualifier = {
|
||||
val a = vocabularies.getSynonymAsQualifier(vocabularyName, term)
|
||||
val b = vocabularies.getTermAsQualifier(vocabularyName, term)
|
||||
if (a == null) b else a
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Map the Pubmed Article into the OAF instance
|
||||
*
|
||||
*
|
||||
* @param article the pubmed articles
|
||||
* @param vocabularies the vocabularies
|
||||
* @return The OAF instance if the mapping did not fail
|
||||
*/
|
||||
def convert(article: PMArticle, vocabularies: VocabularyGroup): Result = {
|
||||
/** Map the Pubmed Article into the OAF instance
|
||||
*
|
||||
* @param article the pubmed articles
|
||||
* @param vocabularies the vocabularies
|
||||
* @return The OAF instance if the mapping did not fail
|
||||
*/
|
||||
def convert(article: PMArticle, vocabularies: VocabularyGroup): Oaf = {
|
||||
|
||||
if (article.getPublicationTypes == null)
|
||||
return null
|
||||
|
||||
|
||||
// MAP PMID into pid with classid = classname = pmid
|
||||
val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
|
||||
val pidList: List[StructuredProperty] = List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
article.getPmid,
|
||||
PidType.pmid.toString,
|
||||
PidType.pmid.toString,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
dataInfo
|
||||
)
|
||||
)
|
||||
if (pidList == null)
|
||||
return null
|
||||
|
||||
|
@ -127,7 +140,14 @@ object PubMedToOaf {
|
|||
if (article.getDoi != null) {
|
||||
val normalizedPid = cleanDoi(article.getDoi)
|
||||
if (normalizedPid != null)
|
||||
alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
|
||||
alternateIdentifier = OafMapperUtils.structuredProperty(
|
||||
normalizedPid,
|
||||
PidType.doi.toString,
|
||||
PidType.doi.toString,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
ModelConstants.DNET_PID_TYPES,
|
||||
dataInfo
|
||||
)
|
||||
}
|
||||
|
||||
// INSTANCE MAPPING
|
||||
|
@ -135,10 +155,12 @@ object PubMedToOaf {
|
|||
|
||||
// If the article contains the typology Journal Article then we apply this type
|
||||
//else We have to find a terms that match the vocabulary otherwise we discard it
|
||||
val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
|
||||
val ja =
|
||||
article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
|
||||
val pubmedInstance = new Instance
|
||||
if (ja.isDefined) {
|
||||
val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
|
||||
val cojbCategory =
|
||||
getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
|
||||
pubmedInstance.setInstancetype(cojbCategory)
|
||||
} else {
|
||||
val i_type = article.getPublicationTypes.asScala
|
||||
|
@ -157,7 +179,9 @@ object PubMedToOaf {
|
|||
if (alternateIdentifier != null)
|
||||
pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
|
||||
result.setInstance(List(pubmedInstance).asJava)
|
||||
pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut)
|
||||
pubmedInstance.getPid.asScala
|
||||
.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid))
|
||||
.map(p => p.getValue)(collection.breakOut)
|
||||
//CREATE URL From pmid
|
||||
val urlLists: List[String] = pidList
|
||||
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
|
||||
|
@ -167,7 +191,9 @@ object PubMedToOaf {
|
|||
pubmedInstance.setUrl(urlLists.asJava)
|
||||
|
||||
//ASSIGN DateofAcceptance
|
||||
pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
|
||||
pubmedInstance.setDateofacceptance(
|
||||
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
|
||||
)
|
||||
//ASSIGN COLLECTEDFROM
|
||||
pubmedInstance.setCollectedfrom(collectedFrom)
|
||||
result.setPid(pidList.asJava)
|
||||
|
@ -175,7 +201,6 @@ object PubMedToOaf {
|
|||
//END INSTANCE MAPPING
|
||||
//--------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
// JOURNAL MAPPING
|
||||
//--------------------------------------------------------------------------------------
|
||||
if (article.getJournal != null && result.isInstanceOf[Publication])
|
||||
|
@ -184,32 +209,48 @@ object PubMedToOaf {
|
|||
//END JOURNAL MAPPING
|
||||
//--------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
// RESULT MAPPING
|
||||
//--------------------------------------------------------------------------------------
|
||||
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
|
||||
result.setDateofacceptance(
|
||||
OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
|
||||
)
|
||||
|
||||
if (article.getTitle == null || article.getTitle.isEmpty)
|
||||
return null
|
||||
result.setTitle(List(OafMapperUtils.structuredProperty(article.getTitle, ModelConstants.MAIN_TITLE_QUALIFIER, dataInfo)).asJava)
|
||||
result.setTitle(
|
||||
List(
|
||||
OafMapperUtils.structuredProperty(
|
||||
article.getTitle,
|
||||
ModelConstants.MAIN_TITLE_QUALIFIER,
|
||||
dataInfo
|
||||
)
|
||||
).asJava
|
||||
)
|
||||
|
||||
if (article.getDescription != null && article.getDescription.nonEmpty)
|
||||
result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava)
|
||||
|
||||
if (article.getLanguage != null) {
|
||||
|
||||
val term = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
|
||||
val term =
|
||||
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
|
||||
if (term != null)
|
||||
result.setLanguage(term)
|
||||
}
|
||||
|
||||
|
||||
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut)
|
||||
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s =>
|
||||
OafMapperUtils.structuredProperty(
|
||||
s.getValue,
|
||||
SUBJ_CLASS,
|
||||
SUBJ_CLASS,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
|
||||
dataInfo
|
||||
)
|
||||
)(collection.breakOut)
|
||||
if (subjects != null)
|
||||
result.setSubject(subjects.asJava)
|
||||
|
||||
|
||||
val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) =>
|
||||
val author = new Author()
|
||||
author.setName(a.getForeName)
|
||||
|
@ -219,15 +260,12 @@ object PubMedToOaf {
|
|||
author
|
||||
}(collection.breakOut)
|
||||
|
||||
|
||||
if (authors != null && authors.nonEmpty)
|
||||
result.setAuthor(authors.asJava)
|
||||
result.setOriginalId(pidList.map(s => s.getValue).asJava)
|
||||
|
||||
|
||||
result.setId(article.getPmid)
|
||||
|
||||
|
||||
// END RESULT MAPPING
|
||||
//--------------------------------------------------------------------------------------
|
||||
val id = IdentifierFactory.createIdentifier(result)
|
||||
|
@ -237,5 +275,4 @@ object PubMedToOaf {
|
|||
result
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,345 @@
|
|||
package eu.dnetlib.dhp.sx.graph
|
||||
|
||||
import eu.dnetlib.dhp.application.AbstractScalaApplication
|
||||
import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||
import eu.dnetlib.dhp.datacite.{DataciteToOAFTransformation, DataciteType}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
||||
import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
|
||||
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
|
||||
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
|
||||
import eu.dnetlib.dhp.utils.{DHPUtils, ISLookupClientFactory}
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.spark.sql.functions.max
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import java.text.SimpleDateFormat
|
||||
|
||||
class SparkRetrieveDataciteDelta(propertyPath: String, args: Array[String], log: Logger)
|
||||
extends AbstractScalaApplication(propertyPath, args, log: Logger) {
|
||||
|
||||
val ISO_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ssZ"
|
||||
val simpleFormatter = new SimpleDateFormat(ISO_DATE_PATTERN)
|
||||
|
||||
val SCHOLIX_RESOURCE_PATH_NAME = "scholixResource"
|
||||
val DATACITE_OAF_PATH_NAME = "dataciteOAFUpdate"
|
||||
val PID_MAP_PATH_NAME = "pidMap"
|
||||
val RESOLVED_REL_PATH_NAME = "resolvedRelation"
|
||||
val SCHOLIX_PATH_NAME = "scholix"
|
||||
|
||||
def scholixResourcePath(workingPath: String) = s"$workingPath/$SCHOLIX_RESOURCE_PATH_NAME"
|
||||
def dataciteOAFPath(workingPath: String) = s"$workingPath/$DATACITE_OAF_PATH_NAME"
|
||||
def pidMapPath(workingPath: String) = s"$workingPath/$PID_MAP_PATH_NAME"
|
||||
def resolvedRelationPath(workingPath: String) = s"$workingPath/$RESOLVED_REL_PATH_NAME"
|
||||
def scholixPath(workingPath: String) = s"$workingPath/$SCHOLIX_PATH_NAME"
|
||||
|
||||
/** Utility to parse Date in ISO8601 to epochMillis
|
||||
* @param inputDate The String represents an input date in ISO8601
|
||||
* @return The relative epochMillis of parsed date
|
||||
*/
|
||||
def ISO8601toEpochMillis(inputDate: String): Long = {
|
||||
simpleFormatter.parse(inputDate).getTime
|
||||
}
|
||||
|
||||
/** This method tries to retrieve the last collection date from all datacite
|
||||
* records in HDFS.
|
||||
* This method should be called before indexing scholexplorer to retrieve
|
||||
* the delta of Datacite record to download, since from the generation of
|
||||
* raw graph to the generation of Scholexplorer sometimes it takes 20 days
|
||||
* @param spark
|
||||
* @param entitiesPath
|
||||
* @return the last collection date from the current scholexplorer Graph of the datacite records
|
||||
*/
|
||||
def retrieveLastCollectedFrom(spark: SparkSession, entitiesPath: String): Long = {
|
||||
log.info("Retrieve last entities collected From")
|
||||
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
|
||||
import spark.implicits._
|
||||
|
||||
val entitiesDS = spark.read
|
||||
.load(s"$entitiesPath/*")
|
||||
.as[Oaf]
|
||||
.filter(o => o.isInstanceOf[Result])
|
||||
.map(r => r.asInstanceOf[Result])
|
||||
|
||||
val date = entitiesDS
|
||||
.filter(r => r.getDateofcollection != null)
|
||||
.map(_.getDateofcollection)
|
||||
.select(max("value"))
|
||||
.first
|
||||
.getString(0)
|
||||
|
||||
ISO8601toEpochMillis(date) / 1000
|
||||
}
|
||||
|
||||
/** The method of update Datacite relationships on Scholexplorer
|
||||
* needs some utilities data structures
|
||||
* One is the scholixResource DS that stores all the nodes in the Scholix Graph
|
||||
* in format ScholixResource
|
||||
* @param summaryPath the path of the summary in Scholix
|
||||
* @param workingPath the working path
|
||||
* @param spark the spark session
|
||||
*/
|
||||
def generateScholixResource(
|
||||
summaryPath: String,
|
||||
workingPath: String,
|
||||
spark: SparkSession
|
||||
): Unit = {
|
||||
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
|
||||
|
||||
log.info("Convert All summary to ScholixResource")
|
||||
spark.read
|
||||
.load(summaryPath)
|
||||
.as[ScholixSummary]
|
||||
.map(ScholixUtils.generateScholixResourceFromSummary)(scholixResourceEncoder)
|
||||
.filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"${scholixResourcePath(workingPath)}_native")
|
||||
}
|
||||
|
||||
/** This method convert the new Datacite Resource into Scholix Resource
|
||||
* Needed to fill the source and the type of Scholix Relationships
|
||||
* @param workingPath the Working Path
|
||||
* @param spark The spark Session
|
||||
*/
|
||||
def addMissingScholixResource(workingPath: String, spark: SparkSession): Unit = {
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
|
||||
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
|
||||
import spark.implicits._
|
||||
|
||||
spark.read
|
||||
.load(dataciteOAFPath(workingPath))
|
||||
.as[Oaf]
|
||||
.filter(_.isInstanceOf[Result])
|
||||
.map(_.asInstanceOf[Result])
|
||||
.map(ScholixUtils.generateScholixResourceFromResult)
|
||||
.filter(r => r.getIdentifier != null && r.getIdentifier.size > 0)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"${scholixResourcePath(workingPath)}_update")
|
||||
|
||||
val update = spark.read.load(s"${scholixResourcePath(workingPath)}_update").as[ScholixResource]
|
||||
val native = spark.read.load(s"${scholixResourcePath(workingPath)}_native").as[ScholixResource]
|
||||
val graph = update
|
||||
.union(native)
|
||||
.groupByKey(_.getDnetIdentifier)
|
||||
.reduceGroups((a, b) => if (a != null && a.getDnetIdentifier != null) a else b)
|
||||
.map(_._2)
|
||||
graph.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_graph")
|
||||
}
|
||||
|
||||
/** This method get and Transform only datacite records with
|
||||
* timestamp greater than timestamp
|
||||
* @param datacitePath the datacite input Path
|
||||
* @param timestamp the timestamp
|
||||
* @param workingPath the working path where save the generated Dataset
|
||||
* @param spark SparkSession
|
||||
* @param vocabularies Vocabularies needed for transformation
|
||||
*/
|
||||
|
||||
def getDataciteUpdate(
|
||||
datacitePath: String,
|
||||
timestamp: Long,
|
||||
workingPath: String,
|
||||
spark: SparkSession,
|
||||
vocabularies: VocabularyGroup
|
||||
): Long = {
|
||||
import spark.implicits._
|
||||
val ds = spark.read.load(datacitePath).as[DataciteType]
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
val total = ds.filter(_.timestamp >= timestamp).count()
|
||||
if (total > 0) {
|
||||
ds.filter(_.timestamp >= timestamp)
|
||||
.flatMap(d =>
|
||||
DataciteToOAFTransformation
|
||||
.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true)
|
||||
)
|
||||
.flatMap(i => fixRelations(i))
|
||||
.filter(i => i != null)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(dataciteOAFPath(workingPath))
|
||||
}
|
||||
total
|
||||
}
|
||||
|
||||
/** After added the new ScholixResource, we need to update the scholix Pid Map
|
||||
* to intersected with the new Datacite Relations
|
||||
*
|
||||
* @param workingPath The working Path starting from save the new Map
|
||||
* @param spark the spark session
|
||||
*/
|
||||
def generatePidMap(workingPath: String, spark: SparkSession): Unit = {
|
||||
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
|
||||
import spark.implicits._
|
||||
spark.read
|
||||
.load(s"${scholixResourcePath(workingPath)}_graph")
|
||||
.as[ScholixResource]
|
||||
.flatMap(r =>
|
||||
r.getIdentifier.asScala
|
||||
.map(i => DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema))
|
||||
.map(t => (t, r.getDnetIdentifier))
|
||||
)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
.groupByKey(_._1)
|
||||
.reduceGroups((a, b) => if (a != null && a._2 != null) a else b)
|
||||
.map(_._2)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(pidMapPath(workingPath))
|
||||
}
|
||||
|
||||
/** This method resolve the datacite relation and filter the resolved
|
||||
* relation
|
||||
* @param workingPath the working path
|
||||
* @param spark the spark session
|
||||
*/
|
||||
|
||||
def resolveUpdateRelation(workingPath: String, spark: SparkSession): Unit = {
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
import spark.implicits._
|
||||
|
||||
val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String, String)]
|
||||
|
||||
val unresolvedRelations: Dataset[(String, Relation)] = spark.read
|
||||
.load(dataciteOAFPath(workingPath))
|
||||
.as[Oaf]
|
||||
.filter(_.isInstanceOf[Relation])
|
||||
.map(_.asInstanceOf[Relation])
|
||||
.map { r =>
|
||||
if (r.getSource.startsWith("unresolved"))
|
||||
(r.getSource, r)
|
||||
else
|
||||
(r.getTarget, r)
|
||||
}(Encoders.tuple(Encoders.STRING, relationEncoder))
|
||||
|
||||
unresolvedRelations
|
||||
.joinWith(pidMap, unresolvedRelations("_1").equalTo(pidMap("_1")))
|
||||
.map(t => {
|
||||
val r = t._1._2
|
||||
val resolvedIdentifier = t._2._2
|
||||
if (r.getSource.startsWith("unresolved"))
|
||||
r.setSource(resolvedIdentifier)
|
||||
else
|
||||
r.setTarget(resolvedIdentifier)
|
||||
r
|
||||
})(relationEncoder)
|
||||
.filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved")))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(resolvedRelationPath(workingPath))
|
||||
}
|
||||
|
||||
/** This method generate scholix starting from resolved relation
|
||||
*
|
||||
* @param workingPath
|
||||
* @param spark
|
||||
*/
|
||||
def generateScholixUpdate(workingPath: String, spark: SparkSession): Unit = {
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource]
|
||||
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
implicit val intermediateEncoder: Encoder[(String, Scholix)] =
|
||||
Encoders.tuple(Encoders.STRING, scholixEncoder)
|
||||
|
||||
val relations: Dataset[(String, Relation)] = spark.read
|
||||
.load(resolvedRelationPath(workingPath))
|
||||
.as[Relation]
|
||||
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relationEncoder))
|
||||
|
||||
val id_summary: Dataset[(String, ScholixResource)] = spark.read
|
||||
.load(s"${scholixResourcePath(workingPath)}_graph")
|
||||
.as[ScholixResource]
|
||||
.map(r => (r.getDnetIdentifier, r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder))
|
||||
|
||||
id_summary.cache()
|
||||
|
||||
relations
|
||||
.joinWith(id_summary, relations("_1").equalTo(id_summary("_1")), "inner")
|
||||
.map(t => (t._1._2.getTarget, ScholixUtils.scholixFromSource(t._1._2, t._2._2)))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/scholix_one_verse")
|
||||
|
||||
val source_scholix: Dataset[(String, Scholix)] =
|
||||
spark.read.load(s"$workingPath/scholix_one_verse").as[(String, Scholix)]
|
||||
|
||||
source_scholix
|
||||
.joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")), "inner")
|
||||
.map(t => {
|
||||
val target: ScholixResource = t._2._2
|
||||
val scholix: Scholix = t._1._2
|
||||
ScholixUtils.generateCompleteScholix(scholix, target)
|
||||
})(scholixEncoder)
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$workingPath/scholix")
|
||||
}
|
||||
|
||||
/** Here all the spark applications runs this method
|
||||
* where the whole logic of the spark node is defined
|
||||
*/
|
||||
override def run(): Unit = {
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"SourcePath is '$sourcePath'")
|
||||
|
||||
val datacitePath = parser.get("datacitePath")
|
||||
log.info(s"DatacitePath is '$datacitePath'")
|
||||
|
||||
val workingPath = parser.get("workingSupportPath")
|
||||
log.info(s"workingPath is '$workingPath'")
|
||||
|
||||
val isLookupUrl: String = parser.get("isLookupUrl")
|
||||
log.info("isLookupUrl: {}", isLookupUrl)
|
||||
|
||||
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
|
||||
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
|
||||
require(vocabularies != null)
|
||||
|
||||
val updateDS: Boolean = "true".equalsIgnoreCase(parser.get("updateDS"))
|
||||
log.info(s"updateDS is '$updateDS'")
|
||||
|
||||
var lastCollectionDate = 0L
|
||||
if (updateDS) {
|
||||
generateScholixResource(s"$sourcePath/provision/summaries", workingPath, spark)
|
||||
log.info("Retrieve last entities collected From starting from scholix Graph")
|
||||
lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities")
|
||||
} else {
|
||||
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
|
||||
fs.delete(new Path(s"${scholixResourcePath(workingPath)}_native"), true)
|
||||
fs.rename(
|
||||
new Path(s"${scholixResourcePath(workingPath)}_graph"),
|
||||
new Path(s"${scholixResourcePath(workingPath)}_native")
|
||||
)
|
||||
lastCollectionDate = retrieveLastCollectedFrom(spark, dataciteOAFPath(workingPath))
|
||||
}
|
||||
|
||||
val numRecords =
|
||||
getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
|
||||
if (numRecords > 0) {
|
||||
addMissingScholixResource(workingPath, spark)
|
||||
generatePidMap(workingPath, spark)
|
||||
resolveUpdateRelation(workingPath, spark)
|
||||
generateScholixUpdate(workingPath, spark)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
object SparkRetrieveDataciteDelta {
|
||||
val log: Logger = LoggerFactory.getLogger(SparkRetrieveDataciteDelta.getClass)
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
new SparkRetrieveDataciteDelta(
|
||||
"/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json",
|
||||
args,
|
||||
log
|
||||
).initialize().run()
|
||||
}
|
||||
}
|
|
@ -1,9 +1,20 @@
|
|||
##DHP-Aggregation
|
||||
|
||||
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
|
||||
This module defines a set of oozie workflows for
|
||||
|
||||
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
|
||||
1. the **collection** and **transformation** of metadata records.
|
||||
2. the **integration** of new external information in the result
|
||||
|
||||
|
||||
### Collection and Transformation
|
||||
|
||||
The workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
|
||||
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
|
||||
of each MDStore.
|
||||
|
||||
It defines [mappings](mappings.md) for transformation of different datasource (See mapping section).
|
||||
It defines [mappings](mappings.md) for transformation of different datasource (See mapping section).
|
||||
|
||||
### Integration of external information in the result
|
||||
|
||||
The workflows create new entity in the OpenAIRE format (OAF) whose aim is to enrich the result already contained in the graph.
|
||||
See integration section for more insight
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
DHP Aggregation - Integration method
|
||||
=====================================
|
||||
|
||||
The integration method can be applied every time new information, which is not aggregated from the repositories
|
||||
nor computed directly by OpenAIRE, should be added to the results of the graph.
|
||||
|
||||
The information integrated so far is:
|
||||
|
||||
1. Article impact measures
|
||||
1. [Bip!Finder](https://dl.acm.org/doi/10.1145/3357384.3357850) scores
|
||||
2. Result Subjects
|
||||
1. Integration of Fields of Science and Techonology ([FOS](https://www.qnrf.org/en-us/FOS)) classification in
|
||||
results subjects.
|
||||
|
||||
|
||||
The method always consists in the creation of a new entity in the OpenAIRE format (OAF entity) containing only the id
|
||||
and the element in the OAF model that should be used to map the information we want to integrate.
|
||||
|
||||
The id is set by using a particular encoding of the given PID
|
||||
|
||||
*unresolved::[pid]::[pidtype]*
|
||||
|
||||
where
|
||||
|
||||
1. *unresolved* is a constant value
|
||||
2. *pid* is the persistent id value, e.g. 10.5281/zenodo.4707307
|
||||
3. *pidtype* is the persistent id type, e.g. doi
|
||||
|
||||
Such entities are matched against those available in the graph using the result.instance.pid values.
|
||||
|
||||
This mechanism can be used to integrate enrichments produced as associated by a given PID.
|
||||
If a match will be found with one of the results already in the graph that said result will be enriched with the information
|
||||
present in the new OAF.
|
||||
All the entities for which a match is not found are discarded.
|
||||
|
||||
|
|
@ -4,13 +4,13 @@ This section describes the mapping implemented for [MEDLINE/PubMed](https://pubm
|
|||
Collection
|
||||
---------
|
||||
The native data is collected from [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with
|
||||
the following [shcema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html)
|
||||
the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html)
|
||||
|
||||
|
||||
Parsing
|
||||
-------
|
||||
The resposible class of parsing is [PMParser](./scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates
|
||||
an intermediate mapping of PubMed Article defined [here](/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html)
|
||||
The resposible class of parsing is [PMParser](/dnet-hadoop/scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates
|
||||
an intermediate mapping of PubMed Article defined [here](/dnet-hadoop/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html)
|
||||
|
||||
|
||||
Mapping
|
||||
|
@ -50,6 +50,10 @@ The table below describes the mapping from the XML Native to the OAF mapping
|
|||
|//Author/FullName| author.Forename| Concatenation of forname + lastName if exist |
|
||||
|FOR ALL AUTHOR | author.rank| sequential number starting from 1|
|
||||
|
||||
#TODO
|
||||
|
||||
Missing item mapped
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -20,7 +20,9 @@
|
|||
<item name="Pubmed" href="pubmed.html"/>
|
||||
<item name="Datacite" href="datacite.html"/>
|
||||
</item>
|
||||
<item name="Release Notes" href="release-notes.html" />
|
||||
<item name="Integration" href="integration.html" collapse="true">
|
||||
|
||||
</item>
|
||||
<item name="General Information" href="about.html"/>
|
||||
|
||||
<item name="JavaDoc" href="apidocs/" />
|
||||
|
|
|
@ -28,6 +28,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class SparkAtomicActionScoreJobTest {
|
||||
|
||||
|
@ -69,13 +70,9 @@ public class SparkAtomicActionScoreJobTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void matchOne() throws Exception {
|
||||
void testMatch() throws Exception {
|
||||
String bipScoresPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
|
||||
.getPath();
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json")
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json")
|
||||
.getPath();
|
||||
|
||||
SparkAtomicActionScoreJob
|
||||
|
@ -84,234 +81,57 @@ public class SparkAtomicActionScoreJobTest {
|
|||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputPath",
|
||||
inputPath,
|
||||
"-bipScorePath",
|
||||
|
||||
bipScoresPath,
|
||||
"-resultTableName",
|
||||
"eu.dnetlib.dhp.schema.oaf.Publication",
|
||||
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Publication> tmp = sc
|
||||
JavaRDD<Result> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Publication) aa.getPayload()));
|
||||
.map(aa -> ((Result) aa.getPayload()));
|
||||
|
||||
assertEquals(1, tmp.count());
|
||||
assertEquals(4, tmp.count());
|
||||
|
||||
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
|
||||
verificationDataset.createOrReplaceTempView("publication");
|
||||
Dataset<Result> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Result.class));
|
||||
verificationDataset.createOrReplaceTempView("result");
|
||||
|
||||
Dataset<Row> execVerification = spark
|
||||
.sql(
|
||||
"Select p.id oaid, mes.id, mUnit.value from publication p " +
|
||||
"Select p.id oaid, mes.id, mUnit.value from result p " +
|
||||
"lateral view explode(measures) m as mes " +
|
||||
"lateral view explode(mes.unit) u as mUnit ");
|
||||
|
||||
Assertions.assertEquals(2, execVerification.count());
|
||||
|
||||
Assertions.assertEquals(12, execVerification.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
|
||||
execVerification.select("oaid").collectAsList().get(0).getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"1.47565045883e-08",
|
||||
execVerification.filter("id = 'influence'").select("value").collectAsList().get(0).getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0.227515392",
|
||||
execVerification.filter("id = 'popularity'").select("value").collectAsList().get(0).getString(0));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void matchOneWithTwo() throws Exception {
|
||||
String bipScoresPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
|
||||
.getPath();
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json")
|
||||
.getPath();
|
||||
|
||||
SparkAtomicActionScoreJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputPath",
|
||||
inputPath,
|
||||
"-bipScorePath",
|
||||
bipScoresPath,
|
||||
"-resultTableName",
|
||||
"eu.dnetlib.dhp.schema.oaf.Publication",
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Publication> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Publication) aa.getPayload()));
|
||||
|
||||
assertEquals(1, tmp.count());
|
||||
|
||||
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
|
||||
verificationDataset.createOrReplaceTempView("publication");
|
||||
|
||||
Dataset<Row> execVerification = spark
|
||||
.sql(
|
||||
"Select p.id oaid, mes.id, mUnit.value from publication p " +
|
||||
"lateral view explode(measures) m as mes " +
|
||||
"lateral view explode(mes.unit) u as mUnit ");
|
||||
|
||||
Assertions.assertEquals(4, execVerification.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
|
||||
execVerification.select("oaid").collectAsList().get(0).getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("id = 'influence'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("id = 'popularity'").count());
|
||||
|
||||
List<Row> tmp_ds = execVerification.filter("id = 'influence'").select("value").collectAsList();
|
||||
String tmp_influence = tmp_ds.get(0).getString(0);
|
||||
assertTrue(
|
||||
"1.47565045883e-08".equals(tmp_influence) ||
|
||||
"1.98956540239e-08".equals(tmp_influence));
|
||||
|
||||
tmp_influence = tmp_ds.get(1).getString(0);
|
||||
assertTrue(
|
||||
"1.47565045883e-08".equals(tmp_influence) ||
|
||||
"1.98956540239e-08".equals(tmp_influence));
|
||||
|
||||
assertNotEquals(tmp_ds.get(1).getString(0), tmp_ds.get(0).getString(0));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void matchTwo() throws Exception {
|
||||
String bipScoresPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
|
||||
.getPath();
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json")
|
||||
.getPath();
|
||||
|
||||
SparkAtomicActionScoreJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputPath",
|
||||
inputPath,
|
||||
"-bipScorePath",
|
||||
bipScoresPath,
|
||||
"-resultTableName",
|
||||
"eu.dnetlib.dhp.schema.oaf.Publication",
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Publication> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Publication) aa.getPayload()));
|
||||
|
||||
assertEquals(2, tmp.count());
|
||||
|
||||
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
|
||||
verificationDataset.createOrReplaceTempView("publication");
|
||||
|
||||
Dataset<Row> execVerification = spark
|
||||
.sql(
|
||||
"Select p.id oaid, mes.id, mUnit.value from publication p " +
|
||||
"lateral view explode(measures) m as mes " +
|
||||
"lateral view explode(mes.unit) u as mUnit ");
|
||||
|
||||
Assertions.assertEquals(4, execVerification.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("id = 'influence'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("id = 'popularity'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"1.47565045883e-08",
|
||||
execVerification
|
||||
"6.63451994567e-09", execVerification
|
||||
.filter(
|
||||
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
|
||||
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
||||
"and id = 'influence'")
|
||||
.select("value")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"1.98956540239e-08",
|
||||
execVerification
|
||||
"0.348694533145", execVerification
|
||||
.filter(
|
||||
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
|
||||
"and id = 'influence'")
|
||||
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
||||
"and id = 'popularity_alt'")
|
||||
.select("value")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0.282046161584",
|
||||
execVerification
|
||||
"2.16094680115e-09", execVerification
|
||||
.filter(
|
||||
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
|
||||
"and id = 'popularity'")
|
||||
.select("value")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0.227515392",
|
||||
execVerification
|
||||
.filter(
|
||||
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
|
||||
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
||||
"and id = 'popularity'")
|
||||
.select("value")
|
||||
.collectAsList()
|
||||
|
|
|
@ -18,17 +18,14 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class PrepareTest {
|
||||
|
@ -96,13 +93,18 @@ public class PrepareTest {
|
|||
String doi1 = "unresolved::10.0000/096020199389707::doi";
|
||||
|
||||
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi1)).count());
|
||||
Assertions.assertEquals(3, tmp.filter(r -> r.getId().equals(doi1)).collect().get(0).getMeasures().size());
|
||||
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi1)).collect().get(0).getInstance().size());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, tmp.filter(r -> r.getId().equals(doi1)).collect().get(0).getInstance().get(0).getMeasures().size());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"6.34596412687e-09", tmp
|
||||
.filter(r -> r.getId().equals(doi1))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(sl -> sl.getId().equals("influence"))
|
||||
|
@ -117,6 +119,8 @@ public class PrepareTest {
|
|||
.filter(r -> r.getId().equals(doi1))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(sl -> sl.getId().equals("popularity_alt"))
|
||||
|
@ -131,6 +135,8 @@ public class PrepareTest {
|
|||
.filter(r -> r.getId().equals(doi1))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.stream()
|
||||
.filter(sl -> sl.getId().equals("popularity"))
|
||||
|
@ -140,34 +146,10 @@ public class PrepareTest {
|
|||
.get(0)
|
||||
.getValue());
|
||||
|
||||
}
|
||||
final String doi2 = "unresolved::10.3390/s18072310::doi";
|
||||
|
||||
@Test
|
||||
void getFOSFileTest() throws IOException, ClassNotFoundException {
|
||||
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/h2020_fos_sbs.csv")
|
||||
.getPath();
|
||||
final String outputPath = workingDir.toString() + "/fos.json";
|
||||
|
||||
new GetFOSData()
|
||||
.doRewrite(
|
||||
sourcePath, outputPath, "eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel",
|
||||
'\t', fs);
|
||||
|
||||
BufferedReader in = new BufferedReader(
|
||||
new InputStreamReader(fs.open(new org.apache.hadoop.fs.Path(outputPath))));
|
||||
|
||||
String line;
|
||||
int count = 0;
|
||||
while ((line = in.readLine()) != null) {
|
||||
FOSDataModel fos = new ObjectMapper().readValue(line, FOSDataModel.class);
|
||||
|
||||
System.out.println(new ObjectMapper().writeValueAsString(fos));
|
||||
count += 1;
|
||||
}
|
||||
|
||||
assertEquals(38, count);
|
||||
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).count());
|
||||
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).collect().get(0).getInstance().size());
|
||||
|
||||
}
|
||||
|
||||
|
@ -195,15 +177,8 @@ public class PrepareTest {
|
|||
|
||||
String doi1 = "unresolved::10.3390/s18072310::doi";
|
||||
|
||||
assertEquals(50, tmp.count());
|
||||
assertEquals(20, tmp.count());
|
||||
assertEquals(1, tmp.filter(row -> row.getId().equals(doi1)).count());
|
||||
assertTrue(
|
||||
tmp
|
||||
.filter(r -> r.getId().equals(doi1))
|
||||
.flatMap(r -> r.getSubject().iterator())
|
||||
.map(sbj -> sbj.getValue())
|
||||
.collect()
|
||||
.contains("engineering and technology"));
|
||||
|
||||
assertTrue(
|
||||
tmp
|
||||
|
@ -211,16 +186,16 @@ public class PrepareTest {
|
|||
.flatMap(r -> r.getSubject().iterator())
|
||||
.map(sbj -> sbj.getValue())
|
||||
.collect()
|
||||
.contains("nano-technology"));
|
||||
.contains("04 agricultural and veterinary sciences"));
|
||||
assertTrue(
|
||||
tmp
|
||||
.filter(r -> r.getId().equals(doi1))
|
||||
.flatMap(r -> r.getSubject().iterator())
|
||||
.map(sbj -> sbj.getValue())
|
||||
.collect()
|
||||
.contains("nanoscience & nanotechnology"));
|
||||
.contains("0404 agricultural biotechnology"));
|
||||
|
||||
String doi = "unresolved::10.1111/1365-2656.12831::doi";
|
||||
String doi = "unresolved::10.1007/s11164-020-04383-6::doi";
|
||||
assertEquals(1, tmp.filter(row -> row.getId().equals(doi)).count());
|
||||
assertTrue(
|
||||
tmp
|
||||
|
@ -228,7 +203,7 @@ public class PrepareTest {
|
|||
.flatMap(r -> r.getSubject().iterator())
|
||||
.map(sbj -> sbj.getValue())
|
||||
.collect()
|
||||
.contains("psychology and cognitive sciences"));
|
||||
.contains("01 natural sciences"));
|
||||
|
||||
assertTrue(
|
||||
tmp
|
||||
|
@ -236,15 +211,114 @@ public class PrepareTest {
|
|||
.flatMap(r -> r.getSubject().iterator())
|
||||
.map(sbj -> sbj.getValue())
|
||||
.collect()
|
||||
.contains("social sciences"));
|
||||
assertFalse(
|
||||
.contains("0104 chemical sciences"));
|
||||
assertTrue(
|
||||
tmp
|
||||
.filter(r -> r.getId().equals(doi))
|
||||
.flatMap(r -> r.getSubject().iterator())
|
||||
.map(sbj -> sbj.getValue())
|
||||
.collect()
|
||||
.contains("NULL"));
|
||||
.contains("010402 general chemistry"));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void sdgPrepareTest() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/sdg/sdg.json")
|
||||
.getPath();
|
||||
|
||||
PrepareSDGSparkJob
|
||||
.main(
|
||||
new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--sourcePath", sourcePath,
|
||||
|
||||
"-outputPath", workingDir.toString() + "/work"
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Result> tmp = sc
|
||||
.textFile(workingDir.toString() + "/work/sdg")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
|
||||
|
||||
String doi1 = "unresolved::10.1001/amaguidesnewsletters.2019.sepoct02::doi";
|
||||
|
||||
assertEquals(32, tmp.count());
|
||||
assertEquals(1, tmp.filter(row -> row.getId().equals(doi1)).count());
|
||||
|
||||
assertTrue(
|
||||
tmp
|
||||
.filter(r -> r.getId().equals(doi1))
|
||||
.flatMap(r -> r.getSubject().iterator())
|
||||
.map(sbj -> sbj.getValue())
|
||||
.collect()
|
||||
.contains("3. Good health"));
|
||||
assertTrue(
|
||||
tmp
|
||||
.filter(r -> r.getId().equals(doi1))
|
||||
.flatMap(r -> r.getSubject().iterator())
|
||||
.map(sbj -> sbj.getValue())
|
||||
.collect()
|
||||
.contains("8. Economic growth"));
|
||||
|
||||
Assertions.assertEquals(32, tmp.filter(row -> row.getDataInfo() != null).count());
|
||||
|
||||
}
|
||||
|
||||
// @Test
|
||||
// void test3() throws Exception {
|
||||
// final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_fos_results_20_12_2021.csv.gz";
|
||||
//
|
||||
// final String outputPath = workingDir.toString() + "/fos.json";
|
||||
// GetFOSSparkJob
|
||||
// .main(
|
||||
// new String[] {
|
||||
// "--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
// "--sourcePath", sourcePath,
|
||||
//
|
||||
// "-outputPath", outputPath
|
||||
//
|
||||
// });
|
||||
//
|
||||
// final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
//
|
||||
// JavaRDD<FOSDataModel> tmp = sc
|
||||
// .textFile(outputPath)
|
||||
// .map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class));
|
||||
//
|
||||
// tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
|
||||
// tmp.foreach(t -> Assertions.assertTrue(t.getLevel1() != null));
|
||||
// tmp.foreach(t -> Assertions.assertTrue(t.getLevel2() != null));
|
||||
// tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
|
||||
//
|
||||
// }
|
||||
//
|
||||
// @Test
|
||||
// void test4() throws Exception {
|
||||
// final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_sdg_results_20_12_21.csv.gz";
|
||||
//
|
||||
// final String outputPath = workingDir.toString() + "/sdg.json";
|
||||
// GetSDGSparkJob
|
||||
// .main(
|
||||
// new String[] {
|
||||
// "--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
// "--sourcePath", sourcePath,
|
||||
//
|
||||
// "-outputPath", outputPath
|
||||
//
|
||||
// });
|
||||
//
|
||||
// final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
//
|
||||
// JavaRDD<SDGDataModel> tmp = sc
|
||||
// .textFile(outputPath)
|
||||
// .map(item -> OBJECT_MAPPER.readValue(item, SDGDataModel.class));
|
||||
//
|
||||
// tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
|
||||
// tmp.foreach(t -> Assertions.assertTrue(t.getSbj() != null));
|
||||
//
|
||||
// }
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.Constants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
|
@ -67,70 +68,12 @@ public class ProduceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void produceTest() throws Exception {
|
||||
void produceTestSubjects() throws Exception {
|
||||
|
||||
final String bipPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
|
||||
.getPath();
|
||||
|
||||
PrepareBipFinder
|
||||
.main(
|
||||
new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--sourcePath", bipPath,
|
||||
"--outputPath", workingDir.toString() + "/work"
|
||||
|
||||
});
|
||||
final String fosPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
|
||||
.getPath();
|
||||
|
||||
PrepareFOSSparkJob
|
||||
.main(
|
||||
new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--sourcePath", fosPath,
|
||||
"-outputPath", workingDir.toString() + "/work"
|
||||
});
|
||||
|
||||
SparkSaveUnresolved.main(new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--sourcePath", workingDir.toString() + "/work",
|
||||
|
||||
"-outputPath", workingDir.toString() + "/unresolved"
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Result> tmp = sc
|
||||
.textFile(workingDir.toString() + "/unresolved")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
|
||||
|
||||
Assertions.assertEquals(135, tmp.count());
|
||||
|
||||
Assertions.assertEquals(1, tmp.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi")).count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, tmp
|
||||
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, tmp
|
||||
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.size());
|
||||
JavaRDD<Result> tmp = getResultJavaRDD();
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi"))
|
||||
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
||||
|
@ -172,14 +115,105 @@ public class ProduceTest {
|
|||
.assertEquals(
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
sbj.getDataInfo().getProvenanceaction().getSchemename()));
|
||||
}
|
||||
|
||||
sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("engineering and technology"));
|
||||
sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("nano-technology"));
|
||||
sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("nanoscience & nanotechnology"));
|
||||
@Test
|
||||
void produceTestMeasuress() throws Exception {
|
||||
|
||||
JavaRDD<Result> tmp = getResultJavaRDD();
|
||||
|
||||
List<KeyValue> mes = tmp
|
||||
.filter(row -> row.getInstance() != null && row.getInstance().size() > 0)
|
||||
.flatMap(row -> row.getInstance().iterator())
|
||||
.flatMap(i -> i.getMeasures().iterator())
|
||||
.flatMap(m -> m.getUnit().iterator())
|
||||
.collect();
|
||||
|
||||
mes.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getDeletedbyinference()));
|
||||
mes.forEach(sbj -> Assertions.assertEquals(true, sbj.getDataInfo().getInferred()));
|
||||
mes.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getInvisible()));
|
||||
mes.forEach(sbj -> Assertions.assertEquals("", sbj.getDataInfo().getTrust()));
|
||||
mes.forEach(sbj -> Assertions.assertEquals("update", sbj.getDataInfo().getInferenceprovenance()));
|
||||
mes
|
||||
.forEach(
|
||||
sbj -> Assertions.assertEquals("measure:bip", sbj.getDataInfo().getProvenanceaction().getClassid()));
|
||||
mes
|
||||
.forEach(
|
||||
sbj -> Assertions
|
||||
.assertEquals("Inferred by OpenAIRE", sbj.getDataInfo().getProvenanceaction().getClassname()));
|
||||
mes
|
||||
.forEach(
|
||||
sbj -> Assertions
|
||||
.assertEquals(
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS, sbj.getDataInfo().getProvenanceaction().getSchemeid()));
|
||||
mes
|
||||
.forEach(
|
||||
sbj -> Assertions
|
||||
.assertEquals(
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
sbj.getDataInfo().getProvenanceaction().getSchemename()));
|
||||
}
|
||||
|
||||
@Test
|
||||
void produceTest6Subjects() throws Exception {
|
||||
final String doi = "unresolved::10.3390/s18072310::doi";
|
||||
|
||||
JavaRDD<Result> tmp = getResultJavaRDD();
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
6, tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("04 agricultural and veterinary sciences")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0404 agricultural biotechnology")));
|
||||
Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("040502 food science")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("03 medical and health sciences")));
|
||||
Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0303 health sciences")));
|
||||
Assertions
|
||||
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("030309 nutrition & dietetics")));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void produceTest3Measures() throws Exception {
|
||||
final String doi = "unresolved::10.3390/s18072310::doi";
|
||||
JavaRDD<Result> tmp = getResultJavaRDD();
|
||||
|
||||
tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.getMeasures()
|
||||
.size());
|
||||
|
||||
List<Measure> measures = tmp
|
||||
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi"))
|
||||
.flatMap(row -> row.getMeasures().iterator())
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.flatMap(row -> row.getInstance().iterator())
|
||||
.flatMap(inst -> inst.getMeasures().iterator())
|
||||
.collect();
|
||||
Assertions
|
||||
.assertEquals(
|
||||
|
@ -216,8 +250,82 @@ public class ProduceTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
49, tmp
|
||||
.filter(row -> !row.getId().equals("unresolved::10.3390/s18072310::doi"))
|
||||
"10.3390/s18072310",
|
||||
tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.getPid()
|
||||
.get(0)
|
||||
.getValue()
|
||||
.toLowerCase());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"doi",
|
||||
tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.getPid()
|
||||
.get(0)
|
||||
.getQualifier()
|
||||
.getClassid());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"Digital Object Identifier",
|
||||
tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getInstance()
|
||||
.get(0)
|
||||
.getPid()
|
||||
.get(0)
|
||||
.getQualifier()
|
||||
.getClassname());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void produceTestMeasures() throws Exception {
|
||||
final String doi = "unresolved::10.3390/s18072310::doi";
|
||||
JavaRDD<Result> tmp = getResultJavaRDD();
|
||||
|
||||
List<StructuredProperty> mes = tmp
|
||||
.filter(row -> row.getInstance() != null && row.getInstance().size() > 0)
|
||||
.flatMap(row -> row.getInstance().iterator())
|
||||
.flatMap(i -> i.getPid().iterator())
|
||||
.collect();
|
||||
|
||||
Assertions.assertEquals(86, mes.size());
|
||||
|
||||
tmp
|
||||
.filter(row -> row.getInstance() != null && row.getInstance().size() > 0)
|
||||
.foreach(
|
||||
e -> Assertions.assertEquals("sysimport:enrich", e.getDataInfo().getProvenanceaction().getClassid()));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void produceTestSomeNumbers() throws Exception {
|
||||
|
||||
final String doi = "unresolved::10.3390/s18072310::doi";
|
||||
JavaRDD<Result> tmp = getResultJavaRDD();
|
||||
|
||||
Assertions.assertEquals(105, tmp.count());
|
||||
|
||||
Assertions.assertEquals(1, tmp.filter(row -> row.getId().equals(doi)).count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
19, tmp
|
||||
.filter(row -> !row.getId().equals(doi))
|
||||
.filter(row -> row.getSubject() != null)
|
||||
.count());
|
||||
|
||||
|
@ -225,10 +333,254 @@ public class ProduceTest {
|
|||
.assertEquals(
|
||||
85,
|
||||
tmp
|
||||
.filter(row -> !row.getId().equals("unresolved::10.3390/s18072310::doi"))
|
||||
.filter(r -> r.getMeasures() != null)
|
||||
.filter(row -> !row.getId().equals(doi))
|
||||
.filter(r -> r.getInstance() != null && r.getInstance().size() > 0)
|
||||
.count());
|
||||
|
||||
}
|
||||
|
||||
private JavaRDD<Result> getResultJavaRDD() throws Exception {
|
||||
final String bipPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
|
||||
.getPath();
|
||||
|
||||
PrepareBipFinder
|
||||
.main(
|
||||
new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--sourcePath", bipPath,
|
||||
"--outputPath", workingDir.toString() + "/work"
|
||||
|
||||
});
|
||||
final String fosPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
|
||||
.getPath();
|
||||
|
||||
PrepareFOSSparkJob
|
||||
.main(
|
||||
new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--sourcePath", fosPath,
|
||||
"-outputPath", workingDir.toString() + "/work"
|
||||
});
|
||||
|
||||
SparkSaveUnresolved.main(new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--sourcePath", workingDir.toString() + "/work",
|
||||
|
||||
"-outputPath", workingDir.toString() + "/unresolved"
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
return sc
|
||||
.textFile(workingDir.toString() + "/unresolved")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
void prepareTest5Subjects() throws Exception {
|
||||
final String doi = "unresolved::10.1063/5.0032658::doi";
|
||||
|
||||
JavaRDD<Result> tmp = getResultJavaRDD();
|
||||
|
||||
Assertions.assertEquals(1, tmp.filter(row -> row.getId().equals(doi)).count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
5, tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("01 natural sciences")));
|
||||
Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0103 physical sciences")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("010304 chemical physics")));
|
||||
Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0104 chemical sciences")));
|
||||
Assertions
|
||||
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("010402 general chemistry")));
|
||||
|
||||
}
|
||||
|
||||
private JavaRDD<Result> getResultJavaRDDPlusSDG() throws Exception {
|
||||
final String bipPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
|
||||
.getPath();
|
||||
|
||||
PrepareBipFinder
|
||||
.main(
|
||||
new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--sourcePath", bipPath,
|
||||
"--outputPath", workingDir.toString() + "/work"
|
||||
|
||||
});
|
||||
final String fosPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
|
||||
.getPath();
|
||||
|
||||
PrepareFOSSparkJob
|
||||
.main(
|
||||
new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--sourcePath", fosPath,
|
||||
"-outputPath", workingDir.toString() + "/work"
|
||||
});
|
||||
|
||||
final String sdgPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/sdg/sdg.json")
|
||||
.getPath();
|
||||
|
||||
PrepareSDGSparkJob
|
||||
.main(
|
||||
new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--sourcePath", sdgPath,
|
||||
"-outputPath", workingDir.toString() + "/work"
|
||||
});
|
||||
|
||||
SparkSaveUnresolved.main(new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--sourcePath", workingDir.toString() + "/work",
|
||||
|
||||
"-outputPath", workingDir.toString() + "/unresolved"
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
return sc
|
||||
.textFile(workingDir.toString() + "/unresolved")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
void produceTestSomeNumbersWithSDG() throws Exception {
|
||||
|
||||
final String doi = "unresolved::10.3390/s18072310::doi";
|
||||
JavaRDD<Result> tmp = getResultJavaRDDPlusSDG();
|
||||
|
||||
Assertions.assertEquals(136, tmp.count());
|
||||
|
||||
Assertions.assertEquals(1, tmp.filter(row -> row.getId().equals(doi)).count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
50, tmp
|
||||
.filter(row -> !row.getId().equals(doi))
|
||||
.filter(row -> row.getSubject() != null)
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
85,
|
||||
tmp
|
||||
.filter(row -> !row.getId().equals(doi))
|
||||
.filter(r -> r.getInstance() != null && r.getInstance().size() > 0)
|
||||
.count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void produceTest7Subjects() throws Exception {
|
||||
final String doi = "unresolved::10.3390/s18072310::doi";
|
||||
|
||||
JavaRDD<Result> tmp = getResultJavaRDDPlusSDG();
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
7, tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
|
||||
List<StructuredProperty> sbjs = tmp
|
||||
.filter(row -> row.getId().equals(doi))
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.collect();
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("04 agricultural and veterinary sciences")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0404 agricultural biotechnology")));
|
||||
Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("040502 food science")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("03 medical and health sciences")));
|
||||
Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0303 health sciences")));
|
||||
Assertions
|
||||
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("030309 nutrition & dietetics")));
|
||||
Assertions
|
||||
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("1. No poverty")));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void produceTestSubjectsWithSDG() throws Exception {
|
||||
|
||||
JavaRDD<Result> tmp = getResultJavaRDDPlusSDG();
|
||||
|
||||
List<StructuredProperty> sbjs_sdg = tmp
|
||||
.filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
|
||||
.flatMap(row -> row.getSubject().iterator())
|
||||
.filter(sbj -> sbj.getQualifier().getClassid().equals(Constants.SDG_CLASS_ID))
|
||||
.collect();
|
||||
|
||||
sbjs_sdg.forEach(sbj -> Assertions.assertEquals("SDG", sbj.getQualifier().getClassid()));
|
||||
sbjs_sdg
|
||||
.forEach(
|
||||
sbj -> Assertions
|
||||
.assertEquals(
|
||||
"Sustainable Development Goals", sbj.getQualifier().getClassname()));
|
||||
sbjs_sdg
|
||||
.forEach(
|
||||
sbj -> Assertions
|
||||
.assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemeid()));
|
||||
sbjs_sdg
|
||||
.forEach(
|
||||
sbj -> Assertions
|
||||
.assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemename()));
|
||||
|
||||
sbjs_sdg.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getDeletedbyinference()));
|
||||
sbjs_sdg.forEach(sbj -> Assertions.assertEquals(true, sbj.getDataInfo().getInferred()));
|
||||
sbjs_sdg.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getInvisible()));
|
||||
sbjs_sdg.forEach(sbj -> Assertions.assertEquals("", sbj.getDataInfo().getTrust()));
|
||||
sbjs_sdg.forEach(sbj -> Assertions.assertEquals("update", sbj.getDataInfo().getInferenceprovenance()));
|
||||
sbjs_sdg
|
||||
.forEach(
|
||||
sbj -> Assertions.assertEquals("subject:sdg", sbj.getDataInfo().getProvenanceaction().getClassid()));
|
||||
sbjs_sdg
|
||||
.forEach(
|
||||
sbj -> Assertions
|
||||
.assertEquals("Inferred by OpenAIRE", sbj.getDataInfo().getProvenanceaction().getClassname()));
|
||||
sbjs_sdg
|
||||
.forEach(
|
||||
sbj -> Assertions
|
||||
.assertEquals(
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS, sbj.getDataInfo().getProvenanceaction().getSchemeid()));
|
||||
sbjs_sdg
|
||||
.forEach(
|
||||
sbj -> Assertions
|
||||
.assertEquals(
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
sbj.getDataInfo().getProvenanceaction().getSchemename()));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -76,7 +76,7 @@ public class CreateOpenCitationsASTest {
|
|||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
|
||||
.getPath();
|
||||
|
||||
CreateActionSetSparkJob
|
||||
|
@ -89,17 +89,17 @@ public class CreateOpenCitationsASTest {
|
|||
"-inputPath",
|
||||
inputPath,
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
workingDir.toString() + "/actionSet1"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Relation> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
|
||||
assertEquals(60, tmp.count());
|
||||
assertEquals(62, tmp.count());
|
||||
|
||||
// tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
|
||||
|
||||
|
@ -110,7 +110,7 @@ public class CreateOpenCitationsASTest {
|
|||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
|
||||
.getPath();
|
||||
|
||||
CreateActionSetSparkJob
|
||||
|
@ -121,17 +121,17 @@ public class CreateOpenCitationsASTest {
|
|||
"-inputPath",
|
||||
inputPath,
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
workingDir.toString() + "/actionSet2"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Relation> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.sequenceFile(workingDir.toString() + "/actionSet2", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
|
||||
assertEquals(44, tmp.count());
|
||||
assertEquals(46, tmp.count());
|
||||
|
||||
// tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
|
||||
|
||||
|
@ -142,7 +142,7 @@ public class CreateOpenCitationsASTest {
|
|||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
|
||||
.getPath();
|
||||
|
||||
CreateActionSetSparkJob
|
||||
|
@ -153,13 +153,13 @@ public class CreateOpenCitationsASTest {
|
|||
"-inputPath",
|
||||
inputPath,
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
workingDir.toString() + "/actionSet3"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Relation> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.sequenceFile(workingDir.toString() + "/actionSet3", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
|
||||
|
@ -175,7 +175,7 @@ public class CreateOpenCitationsASTest {
|
|||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
|
||||
.getPath();
|
||||
|
||||
CreateActionSetSparkJob
|
||||
|
@ -186,13 +186,13 @@ public class CreateOpenCitationsASTest {
|
|||
"-inputPath",
|
||||
inputPath,
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
workingDir.toString() + "/actionSet4"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Relation> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.sequenceFile(workingDir.toString() + "/actionSet4", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
|
||||
|
@ -215,7 +215,7 @@ public class CreateOpenCitationsASTest {
|
|||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
|
||||
.getPath();
|
||||
|
||||
CreateActionSetSparkJob
|
||||
|
@ -226,13 +226,13 @@ public class CreateOpenCitationsASTest {
|
|||
"-inputPath",
|
||||
inputPath,
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
workingDir.toString() + "/actionSet5"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Relation> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.sequenceFile(workingDir.toString() + "/actionSet5", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
|
||||
|
@ -240,8 +240,8 @@ public class CreateOpenCitationsASTest {
|
|||
assertEquals("citation", r.getSubRelType());
|
||||
assertEquals("resultResult", r.getRelType());
|
||||
});
|
||||
assertEquals(22, tmp.filter(r -> r.getRelClass().equals("Cites")).count());
|
||||
assertEquals(22, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count());
|
||||
assertEquals(23, tmp.filter(r -> r.getRelClass().equals("Cites")).count());
|
||||
assertEquals(23, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count());
|
||||
|
||||
}
|
||||
|
||||
|
@ -250,7 +250,7 @@ public class CreateOpenCitationsASTest {
|
|||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
|
||||
.getPath();
|
||||
|
||||
CreateActionSetSparkJob
|
||||
|
@ -261,13 +261,13 @@ public class CreateOpenCitationsASTest {
|
|||
"-inputPath",
|
||||
inputPath,
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
workingDir.toString() + "/actionSet6"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Relation> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.sequenceFile(workingDir.toString() + "/actionSet6", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
|
||||
|
@ -295,7 +295,7 @@ public class CreateOpenCitationsASTest {
|
|||
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/COCI")
|
||||
.getPath();
|
||||
|
||||
CreateActionSetSparkJob
|
||||
|
@ -306,13 +306,13 @@ public class CreateOpenCitationsASTest {
|
|||
"-inputPath",
|
||||
inputPath,
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
workingDir.toString() + "/actionSet7"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Relation> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.sequenceFile(workingDir.toString() + "/actionSet7", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
|
||||
|
|
|
@ -0,0 +1,140 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.opencitations;
|
||||
|
||||
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocalFileSystem;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
|
||||
public class ReadCOCITest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(ReadCOCITest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(ReadCOCITest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(ReadCOCITest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(ReadCOCITest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
void testReadCOCI() throws Exception {
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
|
||||
.getPath();
|
||||
|
||||
LocalFileSystem fs = FileSystem.getLocal(new Configuration());
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1.gz")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input1.gz"));
|
||||
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2.gz")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input2.gz"));
|
||||
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3.gz")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input3.gz"));
|
||||
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input4.gz")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input4.gz"));
|
||||
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input5.gz")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir + "/COCI/input5.gz"));
|
||||
|
||||
ReadCOCI
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-workingPath",
|
||||
workingDir.toString() + "/COCI",
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/COCI_json/",
|
||||
"-inputFile", "input1;input2;input3;input4;input5"
|
||||
});
|
||||
|
||||
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<COCI> tmp = sc
|
||||
.textFile(workingDir.toString() + "/COCI_json/*/")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, COCI.class));
|
||||
|
||||
Assertions.assertEquals(24, tmp.count());
|
||||
|
||||
Assertions.assertEquals(1, tmp.filter(c -> c.getCiting().equals("10.1207/s15327647jcd3,4-01")).count());
|
||||
|
||||
Assertions.assertEquals(8, tmp.filter(c -> c.getCiting().indexOf(".refs") > -1).count());
|
||||
}
|
||||
|
||||
}
|
|
@ -1,56 +0,0 @@
|
|||
package eu.dnetlib.dhp.datacite
|
||||
|
||||
|
||||
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
|
||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import org.junit.jupiter.api.extension.ExtendWith
|
||||
import org.junit.jupiter.api.{BeforeEach, Test}
|
||||
import org.mockito.junit.jupiter.MockitoExtension
|
||||
|
||||
import java.text.SimpleDateFormat
|
||||
import java.util.Locale
|
||||
import scala.io.Source
|
||||
|
||||
@ExtendWith(Array(classOf[MockitoExtension]))
|
||||
class DataciteToOAFTest extends AbstractVocabularyTest{
|
||||
|
||||
|
||||
@BeforeEach
|
||||
def setUp() :Unit = {
|
||||
|
||||
super.setUpVocabulary()
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testDateMapping:Unit = {
|
||||
val inputDate = "2021-07-14T11:52:54+0000"
|
||||
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
|
||||
val dt = ISO8601FORMAT.parse(inputDate)
|
||||
println(dt.getTime)
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testMapping() :Unit = {
|
||||
val record =Source.fromInputStream(getClass.getResourceAsStream("record.json")).mkString
|
||||
|
||||
|
||||
|
||||
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
||||
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true )
|
||||
|
||||
res.foreach(r => {
|
||||
println (mapper.writeValueAsString(r))
|
||||
println("----------------------------")
|
||||
|
||||
})
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -1,190 +0,0 @@
|
|||
package eu.dnetlib.dhp.sx.bio
|
||||
|
||||
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
|
||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
||||
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
|
||||
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf}
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.junit.jupiter.api.Assertions._
|
||||
import org.junit.jupiter.api.extension.ExtendWith
|
||||
import org.junit.jupiter.api.{BeforeEach, Test}
|
||||
import org.mockito.junit.jupiter.MockitoExtension
|
||||
|
||||
import java.io.{BufferedReader, InputStream, InputStreamReader}
|
||||
import java.util.zip.GZIPInputStream
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.Source
|
||||
import scala.xml.pull.XMLEventReader
|
||||
|
||||
@ExtendWith(Array(classOf[MockitoExtension]))
|
||||
class BioScholixTest extends AbstractVocabularyTest{
|
||||
|
||||
|
||||
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES,false)
|
||||
|
||||
@BeforeEach
|
||||
def setUp() :Unit = {
|
||||
|
||||
super.setUpVocabulary()
|
||||
}
|
||||
|
||||
class BufferedReaderIterator(reader: BufferedReader) extends Iterator[String] {
|
||||
override def hasNext() = reader.ready
|
||||
override def next() = reader.readLine()
|
||||
}
|
||||
|
||||
object GzFileIterator {
|
||||
def apply(is: InputStream, encoding: String) = {
|
||||
new BufferedReaderIterator(
|
||||
new BufferedReader(
|
||||
new InputStreamReader(
|
||||
new GZIPInputStream(
|
||||
is), encoding)))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
def testEBIData() = {
|
||||
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")).mkString
|
||||
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
|
||||
new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s)))
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testPubmedToOaf(): Unit = {
|
||||
assertNotNull(vocabularies)
|
||||
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")).mkString
|
||||
val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies))
|
||||
assertEquals(10, r.size)
|
||||
assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p)))
|
||||
println(mapper.writeValueAsString(r.head))
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testPDBToOAF():Unit = {
|
||||
|
||||
assertNotNull(vocabularies)
|
||||
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump")).mkString
|
||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||
|
||||
val result:List[Oaf]= records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
|
||||
|
||||
|
||||
|
||||
assertTrue(result.nonEmpty)
|
||||
result.foreach(r => assertNotNull(r))
|
||||
|
||||
println(result.count(o => o.isInstanceOf[Relation]))
|
||||
println(mapper.writeValueAsString(result.head))
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testUNIprotToOAF():Unit = {
|
||||
|
||||
assertNotNull(vocabularies)
|
||||
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
|
||||
|
||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump")).mkString
|
||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||
|
||||
val result:List[Oaf]= records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
|
||||
|
||||
|
||||
|
||||
assertTrue(result.nonEmpty)
|
||||
result.foreach(r => assertNotNull(r))
|
||||
|
||||
println(result.count(o => o.isInstanceOf[Relation]))
|
||||
println(mapper.writeValueAsString(result.head))
|
||||
|
||||
}
|
||||
|
||||
case class EBILinks(relType:String, date:String, title:String, pmid:String, targetPid:String, targetPidType:String) {}
|
||||
|
||||
def parse_ebi_links(input:String):List[EBILinks] ={
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json = parse(input)
|
||||
val pmid = (json \ "publication" \"pmid").extract[String]
|
||||
for {
|
||||
JObject(link) <- json \\ "Link"
|
||||
JField("Target",JObject(target)) <- link
|
||||
JField("RelationshipType",JObject(relType)) <- link
|
||||
JField("Name", JString(relation)) <- relType
|
||||
JField("PublicationDate",JString(publicationDate)) <- link
|
||||
JField("Title", JString(title)) <- target
|
||||
JField("Identifier",JObject(identifier)) <- target
|
||||
JField("IDScheme", JString(idScheme)) <- identifier
|
||||
JField("ID", JString(id)) <- identifier
|
||||
|
||||
} yield EBILinks(relation, publicationDate, title, pmid, id, idScheme)
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def testCrossrefLinksToOAF():Unit = {
|
||||
|
||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links")).mkString
|
||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||
|
||||
|
||||
val result:List[Oaf] =records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
|
||||
|
||||
assertNotNull(result)
|
||||
assertTrue(result.nonEmpty)
|
||||
|
||||
println(mapper.writeValueAsString(result.head))
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def testEBILinksToOAF():Unit = {
|
||||
val iterator = GzFileIterator(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/ebi_links.gz"), "UTF-8")
|
||||
val data = iterator.next()
|
||||
|
||||
val res = BioDBToOAF.parse_ebi_links(BioDBToOAF.extractEBILinksFromDump(data).links).filter(BioDBToOAF.EBITargetLinksFilter).flatMap(BioDBToOAF.convertEBILinksToOaf)
|
||||
print(res.length)
|
||||
|
||||
|
||||
println(mapper.writeValueAsString(res.head))
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
def scholixResolvedToOAF():Unit ={
|
||||
|
||||
val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")).mkString
|
||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
||||
val l:List[ScholixResolved] = records.lines.map{input =>
|
||||
lazy val json = parse(input)
|
||||
json.extract[ScholixResolved]
|
||||
}.toList
|
||||
|
||||
|
||||
val result:List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s))
|
||||
|
||||
assertTrue(result.nonEmpty)
|
||||
}
|
||||
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,4 @@
|
|||
{"50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1": [{"id": "influence", "unit": [{"value": "6.63451994567e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.348694533145", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.16094680115e-09", "key": "score"}]}]}
|
||||
{"50|dedup_wf_001::05b1f8ce98702f69d07aa5f0429de1e3": [{"id": "influence", "unit": [{"value": "6.25057357279e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "7.0208", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.40234462244e-08", "key": "score"}]}]}
|
||||
{"50|dedup_wf_001::08823c8f5c3ca2eae523817036cdda67": [{"id": "influence", "unit": [{"value": "5.54921449123e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.53012887452e-10", "key": "score"}]}]}
|
||||
{"50|dedup_wf_001::0e72b399325d6efcbe3271891a1dfe4c": [{"id": "influence", "unit": [{"value": "1.63466096315e-08", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "20.9870879741", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.49501495323e-08", "key": "score"}]}]}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,38 +1,39 @@
|
|||
{"doi":"10.3390/s18072310","level1":"engineering and technology","level2":"nano-technology","level3":"nanoscience & nanotechnology"}
|
||||
{"doi":"10.1111/1365-2656.12831\u000210.17863/cam.24369","level1":"social sciences","level2":"psychology and cognitive sciences","level3":"NULL"}
|
||||
{"doi":"10.3929/ethz-b-000187584\u000210.1002/chem.201701644","level1":"natural sciences","level2":"NULL","level3":"NULL"}
|
||||
{"doi":"10.1080/01913123.2017.1367361","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"}
|
||||
{"doi":"10.1051/e3sconf/20199207011","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"environmental sciences"}
|
||||
{"doi":"10.1038/onc.2015.333","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"}
|
||||
{"doi":"10.1093/mnras/staa256","level1":"natural sciences","level2":"physical sciences","level3":"NULL"}
|
||||
{"doi":"10.1016/j.jclepro.2018.07.166","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"}
|
||||
{"doi":"10.1103/physrevlett.125.037403","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
|
||||
{"doi":"10.1080/03602532.2017.1316285","level1":"natural sciences","level2":"NULL","level3":"NULL"}
|
||||
{"doi":"10.1001/jamanetworkopen.2019.1868","level1":"medical and health sciences","level2":"other medical science","level3":"health policy & services"}
|
||||
{"doi":"10.1128/mra.00874-18","level1":"natural sciences","level2":"biological sciences","level3":"plant biology & botany"}
|
||||
{"doi":"10.1016/j.nancom.2018.03.001","level1":"engineering and technology","level2":"NULL","level3":"NULL"}
|
||||
{"doi":"10.1112/topo.12174","level1":"natural sciences","level2":"NULL","level3":"NULL"}
|
||||
{"doi":"10.12688/wellcomeopenres.15846.1","level1":"medical and health sciences","level2":"health sciences","level3":"NULL"}
|
||||
{"doi":"10.21468/scipostphys.3.1.001","level1":"natural sciences","level2":"physical sciences","level3":"NULL"}
|
||||
{"doi":"10.1088/1741-4326/ab6c77","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
|
||||
{"doi":"10.1109/tpwrs.2019.2944747","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"}
|
||||
{"doi":"10.1016/j.expthermflusci.2019.109994\u000210.17863/cam.46212","level1":"engineering and technology","level2":"mechanical engineering","level3":"mechanical engineering & transports"}
|
||||
{"doi":"10.1109/tc.2018.2860012","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"computer hardware & architecture"}
|
||||
{"doi":"10.1002/mma.6622","level1":"natural sciences","level2":"mathematics","level3":"numerical & computational mathematics"}
|
||||
{"doi":"10.1051/radiopro/2020020","level1":"natural sciences","level2":"chemical sciences","level3":"NULL"}
|
||||
{"doi":"10.1007/s12268-019-1003-4","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
|
||||
{"doi":"10.3390/cancers12010236","level1":"medical and health sciences","level2":"health sciences","level3":"biochemistry & molecular biology"}
|
||||
{"doi":"10.6084/m9.figshare.9912614\u000210.6084/m9.figshare.9912614.v1\u000210.1080/00268976.2019.1665199","level1":"natural sciences","level2":"chemical sciences","level3":"physical chemistry"}
|
||||
{"doi":"10.1175/jpo-d-17-0239.1","level1":"natural sciences","level2":"biological sciences","level3":"marine biology & hydrobiology"}
|
||||
{"doi":"10.1007/s13218-020-00674-7","level1":"engineering and technology","level2":"industrial biotechnology","level3":"industrial engineering & automation"}
|
||||
{"doi":"10.1016/j.psyneuen.2016.02.003\u000210.1016/j.psyneuen.2016.02.00310.7892/boris.78886\u000210.7892/boris.78886","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
|
||||
{"doi":"10.1109/ted.2018.2813542","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"}
|
||||
{"doi":"10.3989/scimar.04739.25a","level1":"natural sciences","level2":"biological sciences","level3":"NULL"}
|
||||
{"doi":"10.3390/su12187503","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"NULL"}
|
||||
{"doi":"10.1016/j.ccell.2018.08.017","level1":"medical and health sciences","level2":"basic medicine","level3":"biochemistry & molecular biology"}
|
||||
{"doi":"10.1103/physrevresearch.2.023322","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
|
||||
{"doi":"10.1039/c8cp03234c","level1":"natural sciences","level2":"NULL","level3":"NULL"}
|
||||
{"doi":"10.5281/zenodo.3696557\u000210.5281/zenodo.3696556\u000210.1109/jsac.2016.2545384","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"networking & telecommunications"}
|
||||
{"doi":"10.1038/ng.3667\u000210.1038/ng.3667.\u000210.17615/tct6-4m26\u000210.17863/cam.15649","level1":"medical and health sciences","level2":"health sciences","level3":"genetics & heredity"}
|
||||
{"doi":"10.1016/j.jclepro.2019.119065","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"}
|
||||
{"doi":"10.1111/pce.13392","level1":"agricultural and veterinary sciences","level2":"agriculture, forestry, and fisheries","level3":"agronomy & agriculture"}
|
||||
{"doi":"10.1080/1536383x.2020.1868997","level1":"02 engineering and technology","level2":"0210 nano-technology","level3":"021001 nanoscience & nanotechnology"}
|
||||
{"doi":"10.1080/1536383x.2020.1868997","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010402 general chemistry"}
|
||||
{"doi":"10.1186/s40425-019-0732-8","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030220 oncology & carcinogenesis"}
|
||||
{"doi":"10.1186/s40425-019-0732-8","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030304 developmental biology"}
|
||||
{"doi":"10.1007/s10482-021-01529-3","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030304 developmental biology"}
|
||||
{"doi":"10.1007/s10482-021-01529-3","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030306 microbiology"}
|
||||
{"doi":"10.1155/2021/6643273","level1":"01 natural sciences","level2":"0103 physical sciences","level3":"010301 acoustics"}
|
||||
{"doi":"10.1155/2021/6643273","level1":"02 engineering and technology","level2":"0209 industrial biotechnology","level3":"020901 industrial engineering & automation"}
|
||||
{"doi":"10.12737/article_5d6613dbf2ad51.82646096","level1":"02 engineering and technology","level2":"0210 nano-technology","level3":"021001 nanoscience & nanotechnology"}
|
||||
{"doi":"10.12737/article_5d6613dbf2ad51.82646096","level1":"01 natural sciences","level2":"0103 physical sciences","level3":"010302 applied physics"}
|
||||
{"doi":"10.1216/jie.2020.32.457","level1":"01 natural sciences","level2":"0101 mathematics","level3":"010101 applied mathematics"}
|
||||
{"doi":"10.1216/jie.2020.32.457","level1":"01 natural sciences","level2":"0101 mathematics","level3":"010102 general mathematics"}
|
||||
{"doi":"10.3934/naco.2021021","level1":"02 engineering and technology","level2":"0211 other engineering and technologies","level3":"021103 operations research"}
|
||||
{"doi":"10.3934/naco.2021021","level1":"02 engineering and technology","level2":"0209 industrial biotechnology","level3":"020901 industrial engineering & automation"}
|
||||
{"doi":"10.1080/1034912x.2021.1910933","level1":"05 social sciences","level2":"050301 education","level3":"050301 education"}
|
||||
{"doi":"10.1080/1034912x.2021.1910933","level1":"05 social sciences","level2":"0501 psychology and cognitive sciences","level3":"050104 developmental & child psychology"}
|
||||
{"doi":"10.1016/j.rtbm.2020.100596","level1":"05 social sciences","level2":"0502 economics and business","level3":"050211 marketing"}
|
||||
{"doi":"10.1016/j.rtbm.2020.100596","level1":"05 social sciences","level2":"0502 economics and business","level3":"050212 sport, leisure & tourism"}
|
||||
{"doi":"10.14807/ijmp.v11i8.1220","level1":"05 social sciences","level2":"0502 economics and business","level3":"050211 marketing"}
|
||||
{"doi":"10.14807/ijmp.v11i8.1220","level1":"05 social sciences","level2":"0502 economics and business","level3":"050203 business & management"}
|
||||
{"doi":"10.1007/s13205-020-02415-x","level1":"03 medical and health sciences","level2":"0303 health sciences","level3":"030304 developmental biology"}
|
||||
{"doi":"10.1007/s13205-020-02415-x","level1":"03 medical and health sciences","level2":"0303 health sciences","level3":"030302 biochemistry & molecular biology"}
|
||||
{"doi":"10.3390/s18072310","level1":"04 agricultural and veterinary sciences","level2":"0404 agricultural biotechnology","level3":"040502 food science"}
|
||||
{"doi":"10.3390/s18072310","level1":"03 medical and health sciences","level2":"0303 health sciences","level3":"030309 nutrition & dietetics"}
|
||||
{"doi":"10.1063/5.0032658","level1":"01 natural sciences","level2":"0103 physical sciences","level3":"010304 chemical physics"}
|
||||
{"doi":"10.1063/5.0032658","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010402 general chemistry"}
|
||||
{"doi":"10.1145/3411174.3411195","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020201 artificial intelligence & image processing"}
|
||||
{"doi":"10.1145/3411174.3411195","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020206 networking & telecommunications"}
|
||||
{"doi":"10.1021/acs.joc.0c02755","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010405 organic chemistry"}
|
||||
{"doi":"10.1021/acs.joc.0c02755","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010402 general chemistry"}
|
||||
{"doi":"10.1002/jcp.28608","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030220 oncology & carcinogenesis"}
|
||||
{"doi":"10.1002/jcp.28608","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030304 developmental biology"}
|
||||
{"doi":"10.1097/cmr.0000000000000579","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030220 oncology & carcinogenesis"}
|
||||
{"doi":"10.1097/cmr.0000000000000579","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030304 developmental biology"}
|
||||
{"doi":"10.1007/s11164-020-04383-6","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010405 organic chemistry"}
|
||||
{"doi":"10.1007/s11164-020-04383-6","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010402 general chemistry"}
|
||||
{"doi":"10.1016/j.actpsy.2020.103155","level1":"05 social sciences","level2":"0501 psychology and cognitive sciences","level3":"050105 experimental psychology"}
|
||||
{"doi":"10.1016/j.actpsy.2020.103155","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030217 neurology & neurosurgery"}
|
||||
{"doi":"10.1109/memea49120.2020.9137187","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020208 electrical & electronic engineering"}
|
|
@ -0,0 +1,39 @@
|
|||
10.1080/1536383x.2020.1868997,02 engineering and technology,0210 nano-technology,021001 nanoscience & nanotechnology
|
||||
10.1080/1536383x.2020.1868997,01 natural sciences,0104 chemical sciences,010402 general chemistry
|
||||
10.1186/s40425-019-0732-8,03 medical and health sciences,0302 clinical medicine,030220 oncology & carcinogenesis
|
||||
10.1186/s40425-019-0732-8,03 medical and health sciences,0301 basic medicine,030304 developmental biology
|
||||
10.1007/s10482-021-01529-3,03 medical and health sciences,0301 basic medicine,030304 developmental biology
|
||||
10.1007/s10482-021-01529-3,03 medical and health sciences,0301 basic medicine,030306 microbiology
|
||||
10.1155/2021/6643273,01 natural sciences,0103 physical sciences,010301 acoustics
|
||||
10.1155/2021/6643273,02 engineering and technology,0209 industrial biotechnology,020901 industrial engineering & automation
|
||||
10.12737/article_5d6613dbf2ad51.82646096,02 engineering and technology,0210 nano-technology,021001 nanoscience & nanotechnology
|
||||
10.12737/article_5d6613dbf2ad51.82646096,01 natural sciences,0103 physical sciences,010302 applied physics
|
||||
10.1216/jie.2020.32.457,01 natural sciences,0101 mathematics,010101 applied mathematics
|
||||
10.1216/jie.2020.32.457,01 natural sciences,0101 mathematics,010102 general mathematics
|
||||
10.3934/naco.2021021,02 engineering and technology,0211 other engineering and technologies,021103 operations research
|
||||
10.3934/naco.2021021,02 engineering and technology,0209 industrial biotechnology,020901 industrial engineering & automation
|
||||
10.1080/1034912x.2021.1910933,05 social sciences,050301 education,050301 education
|
||||
10.1080/1034912x.2021.1910933,05 social sciences,0501 psychology and cognitive sciences,050104 developmental & child psychology
|
||||
10.1016/j.rtbm.2020.100596,05 social sciences,0502 economics and business,050211 marketing
|
||||
10.1016/j.rtbm.2020.100596,05 social sciences,0502 economics and business,"050212 sport, leisure & tourism"
|
||||
10.14807/ijmp.v11i8.1220,05 social sciences,0502 economics and business,050211 marketing
|
||||
10.14807/ijmp.v11i8.1220,05 social sciences,0502 economics and business,050203 business & management
|
||||
10.1007/s13205-020-02415-x,03 medical and health sciences,0303 health sciences,030304 developmental biology
|
||||
10.1007/s13205-020-02415-x,03 medical and health sciences,0303 health sciences,030302 biochemistry & molecular biology
|
||||
10.3390/foods10040865,04 agricultural and veterinary sciences,0404 agricultural biotechnology,040502 food science
|
||||
10.3390/foods10040865,03 medical and health sciences,0303 health sciences,030309 nutrition & dietetics
|
||||
10.1063/5.0032658,01 natural sciences,0103 physical sciences,010304 chemical physics
|
||||
10.1063/5.0032658,01 natural sciences,0104 chemical sciences,010402 general chemistry
|
||||
10.1145/3411174.3411195,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020201 artificial intelligence & image processing
|
||||
10.1145/3411174.3411195,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020206 networking & telecommunications
|
||||
10.1021/acs.joc.0c02755,01 natural sciences,0104 chemical sciences,010405 organic chemistry
|
||||
10.1021/acs.joc.0c02755,01 natural sciences,0104 chemical sciences,010402 general chemistry
|
||||
10.1002/jcp.28608,03 medical and health sciences,0302 clinical medicine,030220 oncology & carcinogenesis
|
||||
10.1002/jcp.28608,03 medical and health sciences,0301 basic medicine,030304 developmental biology
|
||||
10.1097/cmr.0000000000000579,03 medical and health sciences,0302 clinical medicine,030220 oncology & carcinogenesis
|
||||
10.1097/cmr.0000000000000579,03 medical and health sciences,0301 basic medicine,030304 developmental biology
|
||||
10.1007/s11164-020-04383-6,01 natural sciences,0104 chemical sciences,010405 organic chemistry
|
||||
10.1007/s11164-020-04383-6,01 natural sciences,0104 chemical sciences,010402 general chemistry
|
||||
10.1016/j.actpsy.2020.103155,05 social sciences,0501 psychology and cognitive sciences,050105 experimental psychology
|
||||
10.1016/j.actpsy.2020.103155,03 medical and health sciences,0302 clinical medicine,030217 neurology & neurosurgery
|
||||
10.1109/memea49120.2020.9137187,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020208 electrical & electronic engineering
|
|
|
@ -1,38 +0,0 @@
|
|||
dedup_wf_001::ddcc7a56fa13e49bcc59c6bdd19ad26c 10.3390/s18072310 engineering and technology nano-technology nanoscience & nanotechnology
|
||||
dedup_wf_001::b76062d56e28224eac56111a4e1e5ecf 10.1111/1365-2656.1283110.17863/cam.24369 social sciences psychology and cognitive sciences NULL
|
||||
dedup_wf_001::bb752acb8f403a25fa7851a302f7b7ac 10.3929/ethz-b-00018758410.1002/chem.201701644 natural sciences NULL NULL
|
||||
dedup_wf_001::2f1435a9201ecf5cbbcb12c9b2d971cd 10.1080/01913123.2017.1367361 medical and health sciences clinical medicine oncology & carcinogenesis
|
||||
dedup_wf_001::fc9e47ec16c67b101724320d4b030514 10.1051/e3sconf/20199207011 natural sciences earth and related environmental sciences environmental sciences
|
||||
dedup_wf_001::caa1e5b4de387cb31751552f4f0f5d72 10.1038/onc.2015.333 medical and health sciences clinical medicine oncology & carcinogenesis
|
||||
dedup_wf_001::c2a98df5637d69bf0524eaf40fe6bf11 10.1093/mnras/staa256 natural sciences physical sciences NULL
|
||||
dedup_wf_001::c221262bdc77cbfd59859a402f0e3991 10.1016/j.jclepro.2018.07.166 engineering and technology other engineering and technologies building & construction
|
||||
doiboost____::d56d9dc21f317b3e009d5b6c8ea87212 10.1103/physrevlett.125.037403 natural sciences physical sciences nuclear & particles physics
|
||||
dedup_wf_001::8a7269c8ee6470b2fb4fd384bc389e08 10.1080/03602532.2017.1316285 natural sciences NULL NULL
|
||||
dedup_wf_001::28342ebbc19833e4e1f4a2b23cf5ee20 10.1001/jamanetworkopen.2019.1868 medical and health sciences other medical science health policy & services
|
||||
dedup_wf_001::c1e1daf2b55dd9ec8e1c7c7458bbc7bc 10.1128/mra.00874-18 natural sciences biological sciences plant biology & botany
|
||||
dedup_wf_001::a2ef4a2720c71907180750e5871298ef 10.1016/j.nancom.2018.03.001 engineering and technology NULL NULL
|
||||
dedup_wf_001::676f46a31519e83a89efcb1c626286fb 10.1112/topo.12174 natural sciences NULL NULL
|
||||
dedup_wf_001::6f2761642f1e39313388e2c4060657dd 10.12688/wellcomeopenres.15846.1 medical and health sciences health sciences NULL
|
||||
dedup_wf_001::e414c1dec599521a9635a60de0f6755b 10.21468/scipostphys.3.1.001 natural sciences physical sciences NULL
|
||||
dedup_wf_001::f3395fe0f330164ea424dc61c86c9a3d 10.1088/1741-4326/ab6c77 natural sciences physical sciences nuclear & particles physics
|
||||
dedup_wf_001::a4f32a97a783117012f1de11797e73f2 10.1109/tpwrs.2019.2944747 engineering and technology electrical engineering, electronic engineering, information engineering electrical & electronic engineering
|
||||
dedup_wf_001::313ae1cd083ae1696d12dd1909f97df8 10.1016/j.expthermflusci.2019.10999410.17863/cam.46212 engineering and technology mechanical engineering mechanical engineering & transports
|
||||
dedup_wf_001::2a300a7d3ca7347791ebcef986bc0682 10.1109/tc.2018.2860012 engineering and technology electrical engineering, electronic engineering, information engineering computer hardware & architecture
|
||||
doiboost____::5b79bd7bd9f87361b4a4abc3cbb2df75 10.1002/mma.6622 natural sciences mathematics numerical & computational mathematics
|
||||
dedup_wf_001::6a3f61f217a2519fbaddea1094e3bfc2 10.1051/radiopro/2020020 natural sciences chemical sciences NULL
|
||||
dedup_wf_001::a3f0430309a639f4234a0e57b10f2dee 10.1007/s12268-019-1003-4 medical and health sciences basic medicine NULL
|
||||
dedup_wf_001::b6b8a3a1cccbee459cf3343485efdb12 10.3390/cancers12010236 medical and health sciences health sciences biochemistry & molecular biology
|
||||
dedup_wf_001::dd06ee7974730e7b09a4f03c83b3f9bd 10.6084/m9.figshare.991261410.6084/m9.figshare.9912614.v110.1080/00268976.2019.1665199 natural sciences chemical sciences physical chemistry
|
||||
dedup_wf_001::027c78bef6f972b5e26dfea55d30fbe3 10.1175/jpo-d-17-0239.1 natural sciences biological sciences marine biology & hydrobiology
|
||||
dedup_wf_001::43edc179aa9e1fbaf582c5203b18b519 10.1007/s13218-020-00674-7 engineering and technology industrial biotechnology industrial engineering & automation
|
||||
dedup_wf_001::e7770e11cd6eb514bb52c07b5a8a80f0 10.1016/j.psyneuen.2016.02.00310.1016/j.psyneuen.2016.02.00310.7892/boris.7888610.7892/boris.78886 medical and health sciences basic medicine NULL
|
||||
dedup_wf_001::80bc15d69bdc589149631f3439dde5aa 10.1109/ted.2018.2813542 engineering and technology electrical engineering, electronic engineering, information engineering electrical & electronic engineering
|
||||
dedup_wf_001::42c1cfa33e7872944b920cff90f4d99e 10.3989/scimar.04739.25a natural sciences biological sciences NULL
|
||||
dedup_wf_001::9bacdbbaa9da3658b7243d5de8e3ce14 10.3390/su12187503 natural sciences earth and related environmental sciences NULL
|
||||
dedup_wf_001::59e43d3527dcfecb6097fbd5740c8950 10.1016/j.ccell.2018.08.017 medical and health sciences basic medicine biochemistry & molecular biology
|
||||
doiboost____::e024d1b738df3b24bc58fa0228542571 10.1103/physrevresearch.2.023322 natural sciences physical sciences nuclear & particles physics
|
||||
dedup_wf_001::66e9a3237fa8178886d26d3c2d5b9e66 10.1039/c8cp03234c natural sciences NULL NULL
|
||||
dedup_wf_001::83737ab4205bae751571bb3b166efa18 10.5281/zenodo.369655710.5281/zenodo.369655610.1109/jsac.2016.2545384 engineering and technology electrical engineering, electronic engineering, information engineering networking & telecommunications
|
||||
dedup_wf_001::e3f892db413a689e572dd256acad55fe 10.1038/ng.366710.1038/ng.3667.10.17615/tct6-4m2610.17863/cam.15649 medical and health sciences health sciences genetics & heredity
|
||||
dedup_wf_001::14ba594e8fd081847bc3f50f56335003 10.1016/j.jclepro.2019.119065 engineering and technology other engineering and technologies building & construction
|
||||
dedup_wf_001::08ac7b33a41bcea2d055ecd8585d632e 10.1111/pce.13392 agricultural and veterinary sciences agriculture, forestry, and fisheries agronomy & agriculture
|
|
|
@ -0,0 +1,37 @@
|
|||
{"doi":"10.1001/amaguidesnewsletters.2019.mayjun02","sbj":"10. No inequality"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2019.novdec01","sbj":"10. No inequality"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2019.sepoct02","sbj":"3. Good health"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2019.sepoct02","sbj":"8. Economic growth"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2020.janfeb01","sbj":"8. Economic growth"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2020.janfeb02","sbj":"3. Good health"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2020.janfeb02","sbj":"8. Economic growth"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2020.julaug01","sbj":"3. Good health"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2020.marapr01","sbj":"3. Good health"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2020.mayjun01","sbj":"3. Good health"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2020.mayjun02","sbj":"16. Peace & justice"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2020.mayjun02","sbj":"10. No inequality"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2021.julaug01","sbj":"1. No poverty"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2021.mayjune01","sbj":"10. No inequality"}
|
||||
{"doi":"10.1001/amaguidesnewsletters.2021.mayjune02","sbj":"10. No inequality"}
|
||||
{"doi":"10.4336/2021.pfb.41e201902078","sbj":"15. Life on land"}
|
||||
{"doi":"10.4337/ejeep.2019.00045","sbj":"16. Peace & justice"}
|
||||
{"doi":"10.4337/ejeep.2019.00050","sbj":"1. No poverty"}
|
||||
{"doi":"10.4337/ejeep.2019.0045","sbj":"16. Peace & justice"}
|
||||
{"doi":"10.4337/ejeep.2019.0050","sbj":"1. No poverty"}
|
||||
{"doi":"10.4337/ejeep.2019.0051","sbj":"16. Peace & justice"}
|
||||
{"doi":"10.4337/ejeep.2019.0052","sbj":"16. Peace & justice"}
|
||||
{"doi":"10.4337/ejeep.2020.0058","sbj":"1. No poverty"}
|
||||
{"doi":"10.4337/ejeep.2020.0058","sbj":"10. No inequality"}
|
||||
{"doi":"10.4337/ejeep.2020.0060","sbj":"10. No inequality"}
|
||||
{"doi":"10.4337/ejeep.2020.0065","sbj":"16. Peace & justice"}
|
||||
{"doi":"10.4337/ejeep.2020.02.03","sbj":"16. Peace & justice"}
|
||||
{"doi":"10.4337/ejeep.2020.02.05","sbj":"8. Economic growth"}
|
||||
{"doi":"10.4337/ejeep.2020.02.06","sbj":"16. Peace & justice"}
|
||||
{"doi":"10.4337/ejeep.2020.02.09","sbj":"16. Peace & justice"}
|
||||
{"doi":"10.4337/roke.2020.01.01","sbj":"16. Peace & justice"}
|
||||
{"doi":"10.4337/roke.2020.01.03","sbj":"16. Peace & justice"}
|
||||
{"doi":"10.4337/roke.2020.01.05","sbj":"1. No poverty"}
|
||||
{"doi":"10.4337/roke.2020.01.05","sbj":"8. Economic growth"}
|
||||
{"doi":"10.4337/roke.2020.01.07","sbj":"8. Economic growth"}
|
||||
{"doi":"10.4337/roke.2020.02.03","sbj":"8. Economic growth"}
|
||||
{"doi":"10.3390/s18072310","sbj":"1. No poverty"}
|
|
@ -0,0 +1,37 @@
|
|||
10.1001/amaguidesnewsletters.2019.mayjun02,10. No inequality
|
||||
10.1001/amaguidesnewsletters.2019.novdec01,10. No inequality
|
||||
10.1001/amaguidesnewsletters.2019.sepoct02,3. Good health
|
||||
10.1001/amaguidesnewsletters.2019.sepoct02,8. Economic growth
|
||||
10.1001/amaguidesnewsletters.2020.janfeb01,8. Economic growth
|
||||
10.1001/amaguidesnewsletters.2020.janfeb02,3. Good health
|
||||
10.1001/amaguidesnewsletters.2020.janfeb02,8. Economic growth
|
||||
10.1001/amaguidesnewsletters.2020.julaug01,3. Good health
|
||||
10.1001/amaguidesnewsletters.2020.marapr01,3. Good health
|
||||
10.1001/amaguidesnewsletters.2020.mayjun01,3. Good health
|
||||
10.1001/amaguidesnewsletters.2020.mayjun02,16. Peace & justice
|
||||
10.1001/amaguidesnewsletters.2020.mayjun02,10. No inequality
|
||||
10.1001/amaguidesnewsletters.2021.julaug01,1. No poverty
|
||||
10.1001/amaguidesnewsletters.2021.mayjune01,10. No inequality
|
||||
10.1001/amaguidesnewsletters.2021.mayjune02,10. No inequality
|
||||
10.4336/2021.pfb.41e201902078,15. Life on land
|
||||
10.4337/ejeep.2019.00045,16. Peace & justice
|
||||
10.4337/ejeep.2019.00050,1. No poverty
|
||||
10.4337/ejeep.2019.0045,16. Peace & justice
|
||||
10.4337/ejeep.2019.0050,1. No poverty
|
||||
10.4337/ejeep.2019.0051,16. Peace & justice
|
||||
10.4337/ejeep.2019.0052,16. Peace & justice
|
||||
10.4337/ejeep.2020.0058,1. No poverty
|
||||
10.4337/ejeep.2020.0058,10. No inequality
|
||||
10.4337/ejeep.2020.0060,10. No inequality
|
||||
10.4337/ejeep.2020.0065,16. Peace & justice
|
||||
10.4337/ejeep.2020.02.03,16. Peace & justice
|
||||
10.4337/ejeep.2020.02.05,8. Economic growth
|
||||
10.4337/ejeep.2020.02.06,16. Peace & justice
|
||||
10.4337/ejeep.2020.02.09,16. Peace & justice
|
||||
10.4337/roke.2020.01.01,16. Peace & justice
|
||||
10.4337/roke.2020.01.03,16. Peace & justice
|
||||
10.4337/roke.2020.01.05,1. No poverty
|
||||
10.4337/roke.2020.01.05,8. Economic growth
|
||||
10.4337/roke.2020.01.07,8. Economic growth
|
||||
10.4337/roke.2020.02.03,8. Economic growth
|
||||
10.4337/roke.2020.02.04,1. No poverty
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue