Merge branch 'beta' into affiliationPropagation

This commit is contained in:
Claudio Atzori 2021-12-14 15:26:56 +01:00
commit 1790fa2d44
125 changed files with 5056 additions and 1362 deletions

2
.gitignore vendored
View File

@ -3,8 +3,6 @@
*.iws *.iws
*.ipr *.ipr
*.iml *.iml
*.ipr
*.iws
*~ *~
.vscode .vscode
.metals .metals

View File

@ -22,9 +22,20 @@
<id>dnet45-releases</id> <id>dnet45-releases</id>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url> <url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
</repository> </repository>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/dhp-code-style</url>
</site>
</distributionManagement> </distributionManagement>
<build> <build>
<extensions>
<extension>
<groupId>org.apache.maven.wagon</groupId>
<artifactId>wagon-ssh</artifactId>
<version>2.10</version>
</extension>
</extensions>
<pluginManagement> <pluginManagement>
<plugins> <plugins>
<plugin> <plugin>
@ -35,7 +46,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId> <artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version> <version>3.9.1</version>
</plugin> </plugin>
</plugins> </plugins>
</pluginManagement> </pluginManagement>
@ -43,6 +54,7 @@
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties> </properties>
</project> </project>

View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

View File

@ -10,6 +10,9 @@
<packaging>pom</packaging> <packaging>pom</packaging>
<description>This module is a container for the build tools used in dnet-hadoop</description> <description>This module is a container for the build tools used in dnet-hadoop</description>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
</properties>
<modules> <modules>
<module>dhp-code-style</module> <module>dhp-code-style</module>
@ -17,4 +20,12 @@
<module>dhp-build-properties-maven-plugin</module> <module>dhp-build-properties-maven-plugin</module>
</modules> </modules>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/</url>
</site>
</distributionManagement>
</project> </project>

View File

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

View File

@ -13,7 +13,51 @@
<artifactId>dhp-common</artifactId> <artifactId>dhp-common</artifactId>
<packaging>jar</packaging> <packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description> <description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- or wherever -->
<goals>
<goal>doc</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies> <dependencies>

View File

@ -57,9 +57,17 @@ public class VocabularyGroup implements Serializable {
final String syn = arr[2].trim(); final String syn = arr[2].trim();
vocs.addSynonyms(vocId, termId, syn); vocs.addSynonyms(vocId, termId, syn);
} }
} }
// add the term names as synonyms
vocs.vocs.values().forEach(voc -> {
voc.getTerms().values().forEach(term -> {
voc.addSynonym(term.getName().toLowerCase(), term.getId());
});
});
return vocs; return vocs;
} }

View File

@ -16,6 +16,8 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
@ -86,6 +88,22 @@ public class GraphCleaningFunctions extends CleaningFunctions {
} }
public static <T extends Oaf> boolean filter(T value) { public static <T extends Oaf> boolean filter(T value) {
if (Boolean.TRUE
.equals(
Optional
.ofNullable(value)
.map(
o -> Optional
.ofNullable(o.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getInvisible())
.orElse(true))
.orElse(true))
.orElse(true))) {
return false;
}
if (value instanceof Datasource) { if (value instanceof Datasource) {
// nothing to evaluate here // nothing to evaluate here
} else if (value instanceof Project) { } else if (value instanceof Project) {
@ -115,7 +133,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return true; return true;
} }
public static <T extends Oaf> T cleanup(T value) { public static <T extends Oaf> T cleanup(T value, VocabularyGroup vocs) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
// nothing to clean here // nothing to clean here
} else if (value instanceof Project) { } else if (value instanceof Project) {
@ -212,6 +230,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.map(GraphCleaningFunctions::cleanValue) .map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.nonNull(r.getFormat())) {
r
.setFormat(
r
.getFormat()
.stream()
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getDescription())) { if (Objects.nonNull(r.getDescription())) {
r r
.setDescription( .setDescription(
@ -234,6 +261,38 @@ public class GraphCleaningFunctions extends CleaningFunctions {
if (Objects.nonNull(r.getInstance())) { if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) { for (Instance i : r.getInstance()) {
if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) {
if (r instanceof Publication) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof Dataset) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof Software) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof OtherResearchProduct) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
}
}
if (Objects.nonNull(i.getPid())) { if (Objects.nonNull(i.getPid())) {
i.setPid(processPidCleaning(i.getPid())); i.setPid(processPidCleaning(i.getPid()));
} }

View File

@ -0,0 +1,72 @@
package eu.dnetlib.dhp.application
import scala.io.Source
/**
* This is the main Interface SparkApplication
* where all the Spark Scala class should inherit
*
*/
trait SparkScalaApplication {
/**
* This is the path in the classpath of the json
* describes all the argument needed to run
*/
val propertyPath: String
/**
* Utility to parse the arguments using the
* property json in the classpath identified from
* the variable propertyPath
*
* @param args the list of arguments
*/
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString)
parser.parseArgument(args)
parser
}
/**
* Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
def run(): Unit
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.Logger
abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
var parser: ArgumentApplicationParser = null
var spark:SparkSession = null
def initialize():SparkScalaApplication = {
parser = parseArguments(args)
spark = createSparkSession()
this
}
/**
* Utility for creating a spark session starting from parser
*
* @return a spark Session
*/
private def createSparkSession():SparkSession = {
require(parser!= null)
val conf:SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
SparkSession.builder().config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
}
}

View File

@ -107,7 +107,7 @@ class OafMapperUtilsTest {
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get()); assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get()); assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get()); assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get()); assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get()); assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get()); assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get()); assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());

View File

@ -1,69 +0,0 @@
package eu.dnetlib.dhp.actionmanager.scholix
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object SparkCreateActionset {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
val workingDirFolder = parser.get("workingDirFolder")
log.info(s"workingDirFolder -> $workingDirFolder")
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result]
implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation]
import spark.implicits._
val relation = spark.read.load(s"$sourcePath/relation").as[Relation]
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
.flatMap(r => List(r.getSource, r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation")
val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]
log.info("extract source and target Identifier involved in relations")
log.info("save relation filtered")
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
.write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf")
log.info("saving entities")
val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))
entities
.joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
.map(p => p._1._2)
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
}
}

View File

@ -1,86 +0,0 @@
package eu.dnetlib.dhp.actionmanager.scholix
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object SparkSaveActionSet {
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
item match {
case dataset: OafDataset =>
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
a.setClazz(classOf[OafDataset])
a.setPayload(dataset)
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
case publication: Publication =>
val a: AtomicAction[Publication] = new AtomicAction[Publication]
a.setClazz(classOf[Publication])
a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case software: Software =>
val a: AtomicAction[Software] = new AtomicAction[Software]
a.setClazz(classOf[Software])
a.setPayload(software)
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
case orp: OtherResearchProduct =>
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
a.setClazz(classOf[OtherResearchProduct])
a.setPayload(orp)
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation])
a.setPayload(relation)
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
case _ =>
null
}
}
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/save_actionset.json")).mkString)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
spark.read.load(sourcePath).as[Oaf]
.map(o => toActionSet(o))
.filter(o => o != null)
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
}
}

View File

@ -7,3 +7,6 @@ log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout. # A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
log4j.logger.org.apache.spark=FATAL
log4j.logger.org.spark_project=FATAL

View File

@ -0,0 +1,134 @@
package eu.dnetlib.dhp.datacite
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
import java.io.InputStream
import java.time.format.DateTimeFormatter
import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source
/**
* This class represent the dataModel of the input Dataset of Datacite
* @param doi THE DOI
* @param timestamp timestamp of last update date
* @param isActive the record is active or deleted
* @param json the json native records
*/
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
/*
The following class are utility class used for the mapping from
json datacite to OAF Shema
*/
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class OAFRelations(relation:String, inverse:String, relType:String)
class DataciteModelConstants extends Serializable {
}
object DataciteModelConstants {
val REL_TYPE_VALUE:String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val DATACITE_NAME = "Datacite"
val dataInfo: DataInfo = dataciteDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
val subRelTypeMapping: Map[String,OAFRelations] = Map(
ModelConstants.REFERENCES -> OAFRelations(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_REFERENCED_BY -> OAFRelations(ModelConstants.IS_REFERENCED_BY,ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.SUPPLEMENT),
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.SUPPLEMENT),
ModelConstants.HAS_PART -> OAFRelations(ModelConstants.HAS_PART,ModelConstants.IS_PART_OF, ModelConstants.PART),
ModelConstants.IS_PART_OF -> OAFRelations(ModelConstants.IS_PART_OF,ModelConstants.HAS_PART, ModelConstants.PART),
ModelConstants.IS_VERSION_OF-> OAFRelations(ModelConstants.IS_VERSION_OF,ModelConstants.HAS_VERSION,ModelConstants.VERSION),
ModelConstants.HAS_VERSION-> OAFRelations(ModelConstants.HAS_VERSION,ModelConstants.IS_VERSION_OF,ModelConstants.VERSION),
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(ModelConstants.IS_IDENTICAL_TO,ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),
ModelConstants.IS_CONTINUED_BY -> OAFRelations(ModelConstants.IS_CONTINUED_BY,ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
ModelConstants.CONTINUES -> OAFRelations(ModelConstants.CONTINUES,ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_NEW_VERSION_OF-> OAFRelations(ModelConstants.IS_NEW_VERSION_OF,ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
ModelConstants.IS_PREVIOUS_VERSION_OF ->OAFRelations(ModelConstants.IS_PREVIOUS_VERSION_OF,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_SOURCE_OF -> OAFRelations(ModelConstants.IS_SOURCE_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
ModelConstants.IS_DERIVED_FROM -> OAFRelations(ModelConstants.IS_DERIVED_FROM,ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
ModelConstants.CITES -> OAFRelations(ModelConstants.CITES,ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
ModelConstants.IS_CITED_BY -> OAFRelations(ModelConstants.IS_CITED_BY,ModelConstants.CITES, ModelConstants.CITATION),
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(ModelConstants.IS_VARIANT_FORM_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(ModelConstants.IS_OBSOLETED_BY,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
ModelConstants.REVIEWS -> OAFRelations(ModelConstants.REVIEWS,ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
ModelConstants.IS_REVIEWED_BY -> OAFRelations(ModelConstants.IS_REVIEWED_BY,ModelConstants.REVIEWS, ModelConstants.REVIEW),
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
ModelConstants.COMPILES -> OAFRelations(ModelConstants.COMPILES,ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_COMPILED_BY -> OAFRelations(ModelConstants.IS_COMPILED_BY,ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
)
val datacite_filter: List[String] = {
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
require(stream!= null)
Source.fromInputStream(stream).getLines().toList
}
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(false,null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, trust)
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
}

View File

@ -2,131 +2,42 @@ package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.datacite.DataciteModelConstants._
import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import java.nio.charset.CodingErrorAction
import java.text.SimpleDateFormat import java.text.SimpleDateFormat
import java.time.LocalDate import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter import java.time.format.DateTimeFormatter
import java.util.regex.Pattern
import java.util.{Date, Locale} import java.util.{Date, Locale}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
import scala.language.postfixOps
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
object DataciteToOAFTransformation { object DataciteToOAFTransformation {
val REL_TYPE_VALUE:String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val subRelTypeMapping: Map[String,(String,String)] = Map(
"References" ->("IsReferencedBy","relationship"),
"IsSupplementTo" ->("IsSupplementedBy","supplement"),
"IsPartOf" ->("HasPart","part"),
"HasPart" ->("IsPartOf","part"),
"IsVersionOf" ->("HasVersion","version"),
"HasVersion" ->("IsVersionOf","version"),
"IsIdenticalTo" ->("IsIdenticalTo","relationship"),
"IsPreviousVersionOf" ->("IsNewVersionOf","version"),
"IsContinuedBy" ->("Continues","relationship"),
"Continues" ->("IsContinuedBy","relationship"),
"IsNewVersionOf" ->("IsPreviousVersionOf","version"),
"IsSupplementedBy" ->("IsSupplementTo","supplement"),
"IsDocumentedBy" ->("Documents","relationship"),
"IsSourceOf" ->("IsDerivedFrom","relationship"),
"Cites" ->("IsCitedBy","citation"),
"IsCitedBy" ->("Cites","citation"),
"IsDerivedFrom" ->("IsSourceOf","relationship"),
"IsVariantFormOf" ->("IsDerivedFrom","version"),
"IsReferencedBy" ->("References","relationship"),
"IsObsoletedBy" ->("IsNewVersionOf","version"),
"Reviews" ->("IsReviewedBy","review"),
"Documents" ->("IsDocumentedBy","relationship"),
"IsCompiledBy" ->("Compiles","relationship"),
"Compiles" ->("IsCompiledBy","relationship"),
"IsReviewedBy" ->("Reviews","review")
)
implicit val codec: Codec = Codec("UTF-8")
codec.onMalformedInput(CodingErrorAction.REPLACE)
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val j_filter: List[String] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
s.lines.toList
}
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
val unknown_repository: HostedByMapType = HostedByMapType(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
val dataInfo: DataInfo = generateDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite")
val hostedByMap: Map[String, HostedByMapType] = { /**
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString * This method should skip record if json contains invalid text
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats * defined in gile datacite_filter
lazy val json: org.json4s.JValue = parse(s) *
json.extract[Map[String, HostedByMapType]] * @param json
} * @return True if the record should be skipped
*/
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH) def skip_record(json: String): Boolean = {
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) datacite_filter.exists(f => json.contains(f))
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
def filter_json(json: String): Boolean = {
j_filter.exists(f => json.contains(f))
} }
@deprecated("this method will be removed", "dhp")
def toActionSet(item: Oaf): (String, String) = { def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
@ -206,6 +117,8 @@ object DataciteToOAFTransformation {
case _: Throwable => "" case _: Throwable => ""
} }
} }
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = { def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) { if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType) val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
@ -324,11 +237,7 @@ object DataciteToOAFTransformation {
val p = match_pattern.get._2 val p = match_pattern.get._2
val grantId = m.matcher(awardUri).replaceAll("$2") val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}" val targetId = s"$p${DHPUtils.md5(grantId)}"
List( List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo)
// REMOVED INVERSE RELATION since there is a specific method that should generate later
// generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
)
} }
else else
List() List()
@ -337,7 +246,7 @@ object DataciteToOAFTransformation {
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = { def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
if (filter_json(input)) if (skip_record(input))
return List() return List()
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -516,8 +425,8 @@ object DataciteToOAFTransformation {
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
if (client.isDefined) { if (client.isDefined) {
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name)) instance.setHostedby(OafMapperUtils.keyValue(generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), ModelConstants.UNKNOWN_REPOSITORY.getValue))
instance.setCollectedfrom(DATACITE_COLLECTED_FROM) instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava) instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier) instance.setAccessright(access_rights_qualifier)
@ -571,7 +480,7 @@ object DataciteToOAFTransformation {
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.setDataInfo(dataInfo) rel.setDataInfo(dataInfo)
val subRelType = subRelTypeMapping(r.relationType)._2 val subRelType = subRelTypeMapping(r.relationType).relType
rel.setRelType(REL_TYPE_VALUE) rel.setRelType(REL_TYPE_VALUE)
rel.setSubRelType(subRelType) rel.setSubRelType(subRelType)
rel.setRelClass(r.relationType) rel.setRelClass(r.relationType)
@ -585,18 +494,9 @@ object DataciteToOAFTransformation {
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue).toList rel.getCollectedfrom.asScala.map(c => c.getValue).toList
rel rel
}).toList })
} }
def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER)
di
}
def generateDSId(input: String): String = { def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::") val b = StringUtils.substringBefore(input, "::")

View File

@ -1,64 +1,94 @@
package eu.dnetlib.dhp.datacite package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.common.Constants.MDSTORE_SIZE_PATH
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord} import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
import eu.dnetlib.dhp.utils.ISLookupClientFactory import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object GenerateDataciteDatasetSpark { class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
/**
* Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
log.info(s"SourcePath is '$sourcePath'")
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks")) val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
log.info(s"exportLinks is '$exportLinks'")
val isLookupUrl: String = parser.get("isLookupUrl") val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl) log.info("isLookupUrl: {}", isLookupUrl)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
val spark: SparkSession = SparkSession.builder().config(conf) require(vocabularies != null)
.appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
.master(master)
.getOrCreate()
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
val mapper = new ObjectMapper()
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info(s"outputBasePath is '$outputBasePath'")
val targetPath = s"$outputBasePath/$MDSTORE_DATA_PATH"
log.info(s"targetPath is '$targetPath'")
generateDataciteDataset(sourcePath, exportLinks, vocabularies, targetPath, spark)
reportTotalSize(targetPath, outputBasePath)
}
/**
* For working with MDStore we need to store in a file on hdfs the size of
* the current dataset
* @param targetPath
* @param outputBasePath
*/
def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = {
val total_items = spark.read.load(targetPath).count()
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
}
/**
* Generate the transformed and cleaned OAF Dataset from the native one
* @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
* @param exportLinks If true it generates unresolved links
* @param vocabularies vocabularies for cleaning
* @param targetPath the targetPath of the result Dataset
*/
def generateDataciteDataset(sourcePath: String, exportLinks: Boolean, vocabularies: VocabularyGroup, targetPath: String, spark:SparkSession):Unit = {
require(spark!= null)
import spark.implicits._ import spark.implicits._
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord] implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
val mapper = new ObjectMapper()
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info("outputBasePath: {}", outputBasePath)
val targetPath = s"$outputBasePath/$MDSTORE_DATA_PATH"
spark.read.load(sourcePath).as[DataciteType] spark.read.load(sourcePath).as[DataciteType]
.filter(d => d.isActive) .filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)) .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
.filter(d => d != null) .filter(d => d != null)
.flatMap(i => fixRelations(i)).filter(i => i != null) .flatMap(i => fixRelations(i)).filter(i => i != null)
.write.mode(SaveMode.Overwrite).save(targetPath) .write.mode(SaveMode.Overwrite).save(targetPath)
}
val total_items = spark.read.load(targetPath).as[Oaf].count() }
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
new GenerateDataciteDatasetSpark("/eu/dnetlib/dhp/datacite/generate_dataset_params.json", args, log).initialize().run()
} }
} }

View File

@ -7,6 +7,7 @@ import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.{compact, parse, render} import org.json4s.jackson.JsonMethods.{compact, parse, render}
import collection.JavaConverters._ import collection.JavaConverters._
object BioDBToOAF { object BioDBToOAF {
case class EBILinkItem(id: Long, links: String) {} case class EBILinkItem(id: Long, links: String) {}

View File

@ -1,9 +1,9 @@
package eu.dnetlib.dhp.sx.bio package eu.dnetlib.dhp.sx.bio
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf
import BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.collection.CollectionUtils import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.Result import eu.dnetlib.dhp.schema.oaf.Result
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf} import eu.dnetlib.dhp.sx.bio.pubmed._
import eu.dnetlib.dhp.utils.ISLookupClientFactory import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration

View File

@ -1,9 +1,8 @@
package eu.dnetlib.dhp.sx.bio.ebi package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
import eu.dnetlib.dhp.sx.bio.pubmed.PMJournal import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.http.client.config.RequestConfig import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.HttpGet import org.apache.http.client.methods.HttpGet

View File

@ -1,11 +1,10 @@
package eu.dnetlib.dhp.sx.bio.ebi package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.BioDBToOAF import eu.dnetlib.dhp.sx.bio.BioDBToOAF
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
import BioDBToOAF.EBILinkItem
import eu.dnetlib.dhp.collection.CollectionUtils
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql._ import org.apache.spark.sql._

View File

@ -4,7 +4,7 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.schema.oaf._
import scala.collection.JavaConverters._ import collection.JavaConverters._
import java.util.regex.Pattern import java.util.regex.Pattern
@ -22,10 +22,10 @@ object PubMedToOaf {
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
/** /**
* Cleaning the DOI Applying regex in order to * Cleaning the DOI Applying regex in order to
* remove doi starting with URL * remove doi starting with URL
*
* @param doi input DOI * @param doi input DOI
* @return cleaned DOI * @return cleaned DOI
*/ */
@ -93,7 +93,6 @@ object PubMedToOaf {
* @param vocabularyName the input vocabulary name * @param vocabularyName the input vocabulary name
* @param vocabularies all the vocabularies * @param vocabularies all the vocabularies
* @param term the term to find * @param term the term to find
*
* @return the cleaned term value * @return the cleaned term value
*/ */
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = { def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
@ -106,7 +105,6 @@ object PubMedToOaf {
/** /**
* Map the Pubmed Article into the OAF instance * Map the Pubmed Article into the OAF instance
* *
*
* @param article the pubmed articles * @param article the pubmed articles
* @param vocabularies the vocabularies * @param vocabularies the vocabularies
* @return The OAF instance if the mapping did not fail * @return The OAF instance if the mapping did not fail
@ -185,7 +183,6 @@ object PubMedToOaf {
//-------------------------------------------------------------------------------------- //--------------------------------------------------------------------------------------
// RESULT MAPPING // RESULT MAPPING
//-------------------------------------------------------------------------------------- //--------------------------------------------------------------------------------------
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))

View File

@ -1,9 +1,20 @@
##DHP-Aggregation ##DHP-Aggregation
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records. This module defines a set of oozie workflows for
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure 1. the **collection** and **transformation** of metadata records.
2. the **integration** of new external information in the result
### Collection and Transformation
The workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
of each MDStore. of each MDStore.
It defines [mappings](mappings.md) for transformation of different datasource (See mapping section). It defines [mappings](mappings.md) for transformation of different datasource (See mapping section).
### Integration of external information in the result
The workflows create new entity in the OpenAIRE format (OAF) which aim is to enrich the result already contained in the graph.
See integration section for more insight

View File

@ -0,0 +1,36 @@
DHP Aggregation - Integration method
=====================================
The integration method can be applied every time new information, which is not aggregated from the repositories
nor computed directly by OpenAIRE, should be added to the results of the graph.
The information integrated so far is:
1. Article impact measures
1. [Bip!Finder](https://dl.acm.org/doi/10.1145/3357384.3357850) scores
2. Result Subjects
1. Integration of Fields od Science and Techonology ([FOS](https://www.qnrf.org/en-us/FOS)) classification in
results subjects.
The method always consists in the creation of a new entity in the OpenAIRE format (OAF entity) containing only the id
and the element in the OAF model that should be used to map the information we want to integrate.
The id is set by using a particular encoding of the given PID
*unresolved:[pid]:[pidtype]*
where
1. *unresolved* is a constant value
2. *pid* is the persistent id value, e.g. 10.5281/zenodo.4707307
3. *pidtype* is the persistent id type, e.g. doi
Such entities are matched against those available in the graph using the result.instance.pid values.
This mechanism can be used to integrate enrichments produced as associated by a given PID.
If a match will be found with one of the results already in the graph that said result will be enriched with the information
present in the new OAF.
All the objects for which a match is not found are discarded.

View File

@ -4,13 +4,13 @@ This section describes the mapping implemented for [MEDLINE/PubMed](https://pubm
Collection Collection
--------- ---------
The native data is collected from [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with The native data is collected from [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with
the following [shcema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html) the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html)
Parsing Parsing
------- -------
The resposible class of parsing is [PMParser](./scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates The resposible class of parsing is [PMParser](/dnet-hadoop/scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates
an intermediate mapping of PubMed Article defined [here](/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html) an intermediate mapping of PubMed Article defined [here](/dnet-hadoop/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html)
Mapping Mapping
@ -50,6 +50,10 @@ The table below describes the mapping from the XML Native to the OAF mapping
|//Author/FullName| author.Forename| Concatenation of forname + lastName if exist | |//Author/FullName| author.Forename| Concatenation of forname + lastName if exist |
|FOR ALL AUTHOR | author.rank| sequential number starting from 1| |FOR ALL AUTHOR | author.rank| sequential number starting from 1|
#TODO
Missing item mapped

View File

@ -20,7 +20,9 @@
<item name="Pubmed" href="pubmed.html"/> <item name="Pubmed" href="pubmed.html"/>
<item name="Datacite" href="datacite.html"/> <item name="Datacite" href="datacite.html"/>
</item> </item>
<item name="Release Notes" href="release-notes.html" /> <item name="Integration" href="integration.html" collapse="true">
</item>
<item name="General Information" href="about.html"/> <item name="General Information" href="about.html"/>
<item name="JavaDoc" href="apidocs/" /> <item name="JavaDoc" href="apidocs/" />

View File

@ -89,13 +89,13 @@ public class CreateOpenCitationsASTest {
"-inputPath", "-inputPath",
inputPath, inputPath,
"-outputPath", "-outputPath",
workingDir.toString() + "/actionSet" workingDir.toString() + "/actionSet1"
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) .sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));
@ -121,13 +121,13 @@ public class CreateOpenCitationsASTest {
"-inputPath", "-inputPath",
inputPath, inputPath,
"-outputPath", "-outputPath",
workingDir.toString() + "/actionSet" workingDir.toString() + "/actionSet2"
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) .sequenceFile(workingDir.toString() + "/actionSet2", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));
@ -153,13 +153,13 @@ public class CreateOpenCitationsASTest {
"-inputPath", "-inputPath",
inputPath, inputPath,
"-outputPath", "-outputPath",
workingDir.toString() + "/actionSet" workingDir.toString() + "/actionSet3"
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) .sequenceFile(workingDir.toString() + "/actionSet3", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));
@ -186,13 +186,13 @@ public class CreateOpenCitationsASTest {
"-inputPath", "-inputPath",
inputPath, inputPath,
"-outputPath", "-outputPath",
workingDir.toString() + "/actionSet" workingDir.toString() + "/actionSet4"
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) .sequenceFile(workingDir.toString() + "/actionSet4", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));
@ -226,13 +226,13 @@ public class CreateOpenCitationsASTest {
"-inputPath", "-inputPath",
inputPath, inputPath,
"-outputPath", "-outputPath",
workingDir.toString() + "/actionSet" workingDir.toString() + "/actionSet5"
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) .sequenceFile(workingDir.toString() + "/actionSet5", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));
@ -261,13 +261,13 @@ public class CreateOpenCitationsASTest {
"-inputPath", "-inputPath",
inputPath, inputPath,
"-outputPath", "-outputPath",
workingDir.toString() + "/actionSet" workingDir.toString() + "/actionSet6"
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) .sequenceFile(workingDir.toString() + "/actionSet6", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));
@ -306,13 +306,13 @@ public class CreateOpenCitationsASTest {
"-inputPath", "-inputPath",
inputPath, inputPath,
"-outputPath", "-outputPath",
workingDir.toString() + "/actionSet" workingDir.toString() + "/actionSet7"
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) .sequenceFile(workingDir.toString() + "/actionSet7", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));

View File

@ -1,56 +0,0 @@
package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import java.text.SimpleDateFormat
import java.util.Locale
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
class DataciteToOAFTest extends AbstractVocabularyTest{
@BeforeEach
def setUp() :Unit = {
super.setUpVocabulary()
}
@Test
def testDateMapping:Unit = {
val inputDate = "2021-07-14T11:52:54+0000"
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
val dt = ISO8601FORMAT.parse(inputDate)
println(dt.getTime)
}
@Test
def testMapping() :Unit = {
val record =Source.fromInputStream(getClass.getResourceAsStream("record.json")).mkString
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true )
res.foreach(r => {
println (mapper.writeValueAsString(r))
println("----------------------------")
})
}
}

View File

@ -0,0 +1,114 @@
package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.commons.io.FileUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.{col, count}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory}
import java.nio.file.{Files, Path}
import java.text.SimpleDateFormat
import java.util.Locale
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
class DataciteToOAFTest extends AbstractVocabularyTest{
private var workingDir:Path = null
val log: Logger = LoggerFactory.getLogger(getClass)
@BeforeEach
def setUp() :Unit = {
workingDir= Files.createTempDirectory(getClass.getSimpleName)
super.setUpVocabulary()
}
@AfterEach
def tearDown() :Unit = {
FileUtils.deleteDirectory(workingDir.toFile)
}
@Test
def testDateMapping:Unit = {
val inputDate = "2021-07-14T11:52:54+0000"
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
val dt = ISO8601FORMAT.parse(inputDate)
println(dt.getTime)
}
@Test
def testConvert(): Unit = {
val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
val conf = new SparkConf()
val spark:SparkSession = SparkSession.builder().config(conf)
.appName(getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
val instance = new GenerateDataciteDatasetSpark(null, null, log)
val targetPath = s"$workingDir/result"
instance.generateDataciteDataset(path, exportLinks = true, vocabularies,targetPath, spark)
import spark.implicits._
val nativeSize =spark.read.load(path).count()
assertEquals(100, nativeSize)
val result:Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
result.map(s => s.getClass.getSimpleName).groupBy(col("value").alias("class")).agg(count("value").alias("Total")).show(false)
val t = spark.read.load(targetPath).count()
assertTrue(t >0)
spark.stop()
}
@Test
def testMapping() :Unit = {
val record =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")).mkString
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true )
res.foreach(r => {
println (mapper.writeValueAsString(r))
println("----------------------------")
})
}
}

View File

@ -90,7 +90,7 @@ public class ReadBlacklistFromDB implements Closeable {
inverse.setSource(target_direct); inverse.setSource(target_direct);
String encoding = rs.getString("relationship"); String encoding = rs.getString("relationship");
RelationInverse ri = ModelSupport.relationInverseMap.get(encoding); RelationInverse ri = ModelSupport.findInverse(encoding);
direct.setRelClass(ri.getRelClass()); direct.setRelClass(ri.getRelClass());
inverse.setRelClass(ri.getInverseRelClass()); inverse.setRelClass(ri.getInverseRelClass());
direct.setRelType(ri.getRelType()); direct.setRelType(ri.getRelType());

View File

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.blacklist;
import java.util.Arrays;
import java.util.List;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.common.RelationInverse;
public class BlacklistRelationTest {
@Test
public void testRelationInverseLookup() {
final List<String> rels = Arrays
.asList(
"resultResult_relationship_IsRelatedTo",
"resultOrganization_affiliation_isAuthorInstitutionOf",
"resultOrganization_affiliation_hasAuthorInstitution",
"datasourceOrganization_provision_isProvidedBy",
"projectOrganization_participation_hasParticipant",
"resultProject_outcome_produces",
"resultProject_outcome_isProducedBy");
rels.forEach(r -> {
RelationInverse inverse = ModelSupport.relationInverseMap.get(r);
Assertions.assertNotNull(inverse);
Assertions.assertNotNull(inverse.getRelType());
Assertions.assertNotNull(inverse.getSubReltype());
Assertions.assertNotNull(inverse.getRelClass());
});
}
}

View File

@ -19,7 +19,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
class UpdateMatcherTest { public class UpdateMatcherTest {
UpdateMatcher<String> matcher = new EnrichMissingPublicationDate(); UpdateMatcher<String> matcher = new EnrichMissingPublicationDate();

View File

@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test;
import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerMainEntity;
class EnrichMissingPublicationDateTest { public class EnrichMissingPublicationDateTest {
final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate(); final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate();

View File

@ -8,7 +8,7 @@ import java.util.Arrays;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
class SubscriptionUtilsTest { public class SubscriptionUtilsTest {
@Test @Test
void testVerifyListSimilar() { void testVerifyListSimilar() {

View File

@ -9,7 +9,7 @@ import eu.dnetlib.broker.objects.OaBrokerAuthor;
import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.broker.objects.OaBrokerTypedValue;
class TrustUtilsTest { public class TrustUtilsTest {
private static final double THRESHOLD = 0.95; private static final double THRESHOLD = 0.95;

View File

@ -139,12 +139,26 @@ abstract class AbstractSparkAction implements Serializable {
protected boolean isOpenorgs(Relation rel) { protected boolean isOpenorgs(Relation rel) {
return Optional return Optional
.ofNullable(rel.getCollectedfrom()) .ofNullable(rel.getCollectedfrom())
.map( .map(c -> isCollectedFromOpenOrgs(c))
c -> c .orElse(false);
}
protected boolean isOpenorgsDedupRel(Relation rel) {
return isOpenorgs(rel) && isOpenOrgsDedupMergeRelation(rel);
}
private boolean isCollectedFromOpenOrgs(List<KeyValue> c) {
return c
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.anyMatch(kv -> ModelConstants.OPENORGS_NAME.equals(kv.getValue()))) .anyMatch(kv -> ModelConstants.OPENORGS_NAME.equals(kv.getValue()));
.orElse(false); }
private boolean isOpenOrgsDedupMergeRelation(Relation rel) {
return ModelConstants.ORG_ORG_RELTYPE.equals(rel.getRelType()) &&
ModelConstants.DEDUP.equals(rel.getSubRelType())
&& (ModelConstants.IS_MERGED_IN.equals(rel.getRelClass()) ||
ModelConstants.MERGES.equals(rel.getRelClass()));
} }
protected static Boolean parseECField(Field<String> field) { protected static Boolean parseECField(Field<String> field) {

View File

@ -61,7 +61,7 @@ public class SparkCopyRelationsNoOpenorgs extends AbstractSparkAction {
.textFile(relationPath) .textFile(relationPath)
.map(patchRelFn(), Encoders.bean(Relation.class)) .map(patchRelFn(), Encoders.bean(Relation.class))
.toJavaRDD() .toJavaRDD()
.filter(x -> !isOpenorgs(x)); .filter(x -> !isOpenorgsDedupRel(x));
if (log.isDebugEnabled()) { if (log.isDebugEnabled()) {
log.debug("Number of non-Openorgs relations collected: {}", simRels.count()); log.debug("Number of non-Openorgs relations collected: {}", simRels.count());

View File

@ -11,6 +11,8 @@ import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@ -29,6 +31,8 @@ import org.mockito.Mock;
import org.mockito.Mockito; import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@ -226,9 +230,10 @@ public class SparkOpenorgsProvisionTest implements Serializable {
new SparkCopyRelationsNoOpenorgs(parser, spark).run(isLookUpService); new SparkCopyRelationsNoOpenorgs(parser, spark).run(isLookUpService);
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); final JavaRDD<String> rels = jsc.textFile(testDedupGraphBasePath + "/relation");
assertEquals(2382, rels.count());
assertEquals(2380, relations);
} }
@Test @Test
@ -250,7 +255,7 @@ public class SparkOpenorgsProvisionTest implements Serializable {
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
assertEquals(4894, relations); assertEquals(4896, relations);
// check deletedbyinference // check deletedbyinference
final Dataset<Relation> mergeRels = spark final Dataset<Relation> mergeRels = spark

View File

@ -2518,3 +2518,5 @@
{"subRelType": "dedup", "relClass": "isMergedIn", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|openorgs____::5c351d85f02db01ca291acd119f0bd78", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|opendoar____::37248e2f6987b18670dd2b8a51d6ef55", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []} {"subRelType": "dedup", "relClass": "isMergedIn", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|openorgs____::5c351d85f02db01ca291acd119f0bd78", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|opendoar____::37248e2f6987b18670dd2b8a51d6ef55", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []}
{"subRelType": "dedup", "relClass": "merges", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|corda_______::6acb33e6ea8c6fcdabc891c80d083c64", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|openorgs____::e38c1a27fcb0f0ab218828e4f5fc7be9", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []} {"subRelType": "dedup", "relClass": "merges", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|corda_______::6acb33e6ea8c6fcdabc891c80d083c64", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|openorgs____::e38c1a27fcb0f0ab218828e4f5fc7be9", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []}
{"subRelType": "dedup", "relClass": "isMergedIn", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|openorgs____::e38c1a27fcb0f0ab218828e4f5fc7be9", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|corda_______::6acb33e6ea8c6fcdabc891c80d083c64", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []} {"subRelType": "dedup", "relClass": "isMergedIn", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|openorgs____::e38c1a27fcb0f0ab218828e4f5fc7be9", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|corda_______::6acb33e6ea8c6fcdabc891c80d083c64", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []}
{"subRelType": "relationship", "relClass": "IsParentOf", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|openorgs____::e38c1a27fcb0f0ab218828e4f5fc7be9", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|corda_______::6acb33e6ea8c6fcdabc891c80d083c64", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []}
{"subRelType": "relationship", "relClass": "IsChildOf", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|corda_______::6acb33e6ea8c6fcdabc891c80d083c64", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|openorgs____::e38c1a27fcb0f0ab218828e4f5fc7be9", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []}

View File

@ -1,21 +1,19 @@
package eu.dnetlib.doiboost package eu.dnetlib.doiboost
import java.time.LocalDate import com.fasterxml.jackson.databind.ObjectMapper
import java.time.format.DateTimeFormatter
import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{AccessRight, DataInfo, Dataset, Field, Instance, KeyValue, Oaf, OpenAccessRoute, Organization, Publication, Qualifier, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils import org.apache.commons.lang3.StringUtils
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{getClosedAccessQualifier, getEmbargoedAccessQualifier, getUnknownQualifier}
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._

View File

@ -8,11 +8,12 @@ import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkGenerateDOIBoostActionSet { object SparkGenerateDOIBoostActionSet {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
@ -55,7 +56,6 @@ object SparkGenerateDOIBoostActionSet {
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
val asCRelation = spark.read.load(crossRefRelation).as[Relation] val asCRelation = spark.read.load(crossRefRelation).as[Relation]
.filter(r => r != null && r.getSource != null && r.getTarget != null) .filter(r => r != null && r.getSource != null && r.getTarget != null)
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
@ -65,23 +65,12 @@ object SparkGenerateDOIBoostActionSet {
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING)) .map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation) val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec]) d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
} }
} }

View File

@ -9,14 +9,12 @@ import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.functions.col import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString,JArray} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkGenerateDoiBoost { object SparkGenerateDoiBoost {
@ -38,7 +36,6 @@ object SparkGenerateDoiBoost {
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)
@ -118,8 +115,7 @@ object SparkGenerateDoiBoost {
val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p)) val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p)) val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
def applyMerge(item:((String, Publication), (String, Publication))) : Publication = def applyMerge(item: ((String, Publication), (String, Publication))): Publication = {
{
val crossrefPub = item._1._2 val crossrefPub = item._1._2
if (item._2 != null) { if (item._2 != null) {
val otherPub = item._2._2 val otherPub = item._2._2
@ -130,6 +126,7 @@ object SparkGenerateDoiBoost {
} }
crossrefPub crossrefPub
} }
crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin") crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin")
logger.info("Phase 3) Join Result with ORCID") logger.info("Phase 3) Join Result with ORCID")
val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p)) val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
@ -169,7 +166,6 @@ object SparkGenerateDoiBoost {
.select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation] .select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation]
val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication] val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null) .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null)
@ -198,8 +194,6 @@ object SparkGenerateDoiBoost {
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved") })(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
val unresolvedRels: Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => { val unresolvedRels: Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => {
if (r.getSource.startsWith("unresolved")) if (r.getSource.startsWith("unresolved"))

View File

@ -4,20 +4,19 @@ import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{decideAccessRight, _} import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.apache.commons.lang.StringUtils import org.apache.commons.lang.StringUtils
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JValue, _} import org.json4s.JsonAST._
import org.json4s.jackson.JsonMethods._ import org.json4s.jackson.JsonMethods._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.util
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.collection.mutable import scala.collection.mutable
import scala.util.matching.Regex import scala.util.matching.Regex
import java.util
import eu.dnetlib.doiboost.DoiBoostMappingUtil
case class CrossrefDT(doi: String, json:String, timestamp: Long) {} case class CrossrefDT(doi: String, json:String, timestamp: Long) {}

View File

@ -6,7 +6,7 @@ import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.{IntWritable, Text} import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
@ -30,7 +30,6 @@ object CrossrefDataset {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json"))) val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
parser.parseArgument(args) parser.parseArgument(args)

View File

@ -2,17 +2,12 @@ package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.crossref.CrossrefDataset.to_item
import eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries.getClass
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.JArray import org.json4s.jackson.JsonMethods.parse
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source import scala.io.Source
@ -24,7 +19,6 @@ object GenerateCrossrefDataset {
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT] implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
def crossrefElement(meta: String): CrossrefDT = { def crossrefElement(meta: String): CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(meta) lazy val json: json4s.JValue = parse(meta)

View File

@ -4,10 +4,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf import eu.dnetlib.dhp.schema.oaf
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}

View File

@ -2,8 +2,8 @@ package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.JArray import org.json4s.JsonAST.JArray
@ -17,8 +17,6 @@ object UnpackCrtossrefEntries {
val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass) val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass)
def extractDump(input: String): List[String] = { def extractDump(input: String): List[String] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input) lazy val json: json4s.JValue = parse(input)
@ -30,7 +28,6 @@ object UnpackCrtossrefEntries {
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf = new SparkConf val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString) val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)

View File

@ -5,10 +5,10 @@ import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty} import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.collection.mutable import scala.collection.mutable

View File

@ -3,8 +3,8 @@ package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.types._ import org.apache.spark.sql.types._
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkImportMagIntoDataset { object SparkImportMagIntoDataset {
@ -75,7 +75,6 @@ object SparkImportMagIntoDataset {
.master(parser.get("master")).getOrCreate() .master(parser.get("master")).getOrCreate()
stream.foreach { case (k, v) => stream.foreach { case (k, v) =>
val s: StructType = getSchema(k) val s: StructType = getSchema(k)
val df = spark.read val df = spark.read

View File

@ -5,13 +5,10 @@ import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions.{col, collect_list, struct}
import org.apache.spark.sql.functions._
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
object SparkProcessMAG { object SparkProcessMAG {
def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = { def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
@ -20,7 +17,7 @@ object SparkProcessMAG {
.reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2)) .reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
.map(_._2)(Encoders.product[MagPapers]) .map(_._2)(Encoders.product[MagPapers])
.map(mp => { .map(mp => {
new MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi), MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi),
mp.DocType, mp.PaperTitle, mp.OriginalTitle, mp.DocType, mp.PaperTitle, mp.OriginalTitle,
mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String, mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String,
mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId, mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId,
@ -146,13 +143,12 @@ object SparkProcessMAG {
.save(s"$workingPath/mag_publication") .save(s"$workingPath/mag_publication")
spark.read.load(s"$workingPath/mag_publication").as[Publication] spark.read.load(s"$workingPath/mag_publication").as[Publication]
.filter(p => p.getId == null) .filter(p => p.getId != null)
.groupByKey(p => p.getId) .groupByKey(p => p.getId)
.reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b)) .reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b))
.map(_._2) .map(_._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
} }
} }

View File

@ -4,17 +4,16 @@ import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication} import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
import eu.dnetlib.dhp.schema.orcid.{AuthorData, OrcidDOI}
import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo} import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo}
import org.apache.commons.lang.StringUtils import org.apache.commons.lang.StringUtils
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST._ import org.json4s.JsonAST._
import org.json4s.jackson.JsonMethods._ import org.json4s.jackson.JsonMethods._
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){} case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){}

View File

@ -1,15 +1,12 @@
package eu.dnetlib.doiboost.orcid package eu.dnetlib.doiboost.orcid
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._ import org.apache.spark.sql.functions.{col, collect_list}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkPreprocessORCID { object SparkPreprocessORCID {

View File

@ -1,16 +1,14 @@
package eu.dnetlib.doiboost.uw package eu.dnetlib.doiboost.uw
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkMapUnpayWallToOAF { object SparkMapUnpayWallToOAF {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {

View File

@ -4,14 +4,13 @@ import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Instance, OpenAccessRoute, Publication} import eu.dnetlib.dhp.schema.oaf.{AccessRight, Instance, OpenAccessRoute, Publication}
import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import eu.dnetlib.doiboost.uw.UnpayWallToOAF.get_unpaywall_color

View File

@ -1,70 +0,0 @@
package eu.dnetlib.dhp.doiboost
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
import eu.dnetlib.doiboost.{DoiBoostMappingUtil, HostedByItemType}
import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass
import eu.dnetlib.doiboost.mag.ConversionUtil
import eu.dnetlib.doiboost.orcid.ORCIDElement
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Test
import scala.io.Source
class DoiBoostHostedByMapTest {
// @Test
// def testMerge():Unit = {
// val conf: SparkConf = new SparkConf()
// val spark: SparkSession =
// SparkSession
// .builder()
// .config(conf)
// .appName(getClass.getSimpleName)
// .master("local[*]").getOrCreate()
//
//
//
// implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
// implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
// implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
//
//
// import spark.implicits._
// val dataset:RDD[String]= spark.sparkContext.textFile("/home/sandro/Downloads/hbMap.gz")
//
//
// val hbMap:Dataset[(String, HostedByItemType)] =spark.createDataset(dataset.map(DoiBoostMappingUtil.toHostedByItem))
//
//
// hbMap.show()
//
//
//
//
//
//
//
//
//
//
// }
@Test
def idDSGeneration():Unit = {
val s ="doajarticles::0066-782X"
println(DoiBoostMappingUtil.generateDSId(s))
}
}

View File

@ -0,0 +1,20 @@
package eu.dnetlib.dhp.doiboost
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.junit.jupiter.api.Test
class DoiBoostHostedByMapTest {
@Test
def idDSGeneration():Unit = {
val s ="doajarticles::0066-782X"
println(DoiBoostMappingUtil.generateDSId(s))
}
}

View File

@ -1,7 +1,8 @@
package eu.dnetlib.doiboost.crossref package eu.dnetlib.dhp.doiboost.crossref
import eu.dnetlib.dhp.schema.oaf._ import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.doiboost.crossref.Crossref2Oaf
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
@ -21,9 +22,9 @@ class CrossrefMappingTest {
@Test @Test
def testFunderRelationshipsMapping(): Unit = { def testFunderRelationshipsMapping(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("funder_doi")).mkString val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
val funder_name = Source.fromInputStream(getClass.getResourceAsStream("funder_doi")).mkString val funder_name = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
for (line <- funder_doi.lines) { for (line <- funder_doi.lines) {
@ -72,7 +73,7 @@ class CrossrefMappingTest {
@Test @Test
def testOrcidID() :Unit = { def testOrcidID() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("orcid_data.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")).mkString
assertNotNull(json) assertNotNull(json)
@ -93,7 +94,7 @@ class CrossrefMappingTest {
@Test @Test
def testEmptyTitle() :Unit = { def testEmptyTitle() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("empty_title.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")).mkString
assertNotNull(json) assertNotNull(json)
@ -115,7 +116,7 @@ class CrossrefMappingTest {
@Test @Test
def testPeerReviewed(): Unit = { def testPeerReviewed(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("prwTest.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json")).mkString
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
assertNotNull(json) assertNotNull(json)
@ -156,7 +157,7 @@ class CrossrefMappingTest {
@Test @Test
def testJournalRelation(): Unit = { def testJournalRelation(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("awardTest.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json")).mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty) assertFalse(json.isEmpty)
@ -177,7 +178,7 @@ class CrossrefMappingTest {
@Test @Test
def testConvertBookFromCrossRef2Oaf(): Unit = { def testConvertBookFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("book.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json")).mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -233,7 +234,7 @@ class CrossrefMappingTest {
@Test @Test
def testConvertPreprintFromCrossRef2Oaf(): Unit = { def testConvertPreprintFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("preprint.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json")).mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -291,7 +292,7 @@ class CrossrefMappingTest {
@Test @Test
def testConvertDatasetFromCrossRef2Oaf(): Unit = { def testConvertDatasetFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("dataset.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json")).mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -332,7 +333,7 @@ class CrossrefMappingTest {
@Test @Test
def testConvertArticleFromCrossRef2Oaf(): Unit = { def testConvertArticleFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("article.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -400,7 +401,7 @@ class CrossrefMappingTest {
@Test @Test
def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = { def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("dump_file.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json")).mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty); assertFalse(json.isEmpty);
@ -415,55 +416,12 @@ class CrossrefMappingTest {
assert(items.size == 1) assert(items.size == 1)
val result: Result = items.head.asInstanceOf[Publication] val result: Result = items.head.asInstanceOf[Publication]
assertNotNull(result) assertNotNull(result)
logger.info(mapper.writeValueAsString(result)); logger.info(mapper.writeValueAsString(result));
// assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
// assertNotNull(
// result.getDataInfo.getProvenanceaction,
// "DataInfo/Provenance test not null Failed");
// assertFalse(
// result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
// "DataInfo/Provenance/classId test not null Failed");
// assertFalse(
// result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
// "DataInfo/Provenance/className test not null Failed");
// assertFalse(
// result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
// "DataInfo/Provenance/SchemeId test not null Failed");
// assertFalse(
// result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
// "DataInfo/Provenance/SchemeName test not null Failed");
//
// assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
// assertFalse(result.getCollectedfrom.isEmpty);
//
// val collectedFromList = result.getCollectedfrom.asScala
// assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
//
// assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
//
//
// val relevantDates = result.getRelevantdate.asScala
//
// assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
//
// val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
// assertFalse(rels.isEmpty)
// rels.foreach(relation => {
// assertNotNull(relation)
// assertFalse(relation.getSource.isEmpty)
// assertFalse(relation.getTarget.isEmpty)
// assertFalse(relation.getRelClass.isEmpty)
// assertFalse(relation.getRelType.isEmpty)
// assertFalse(relation.getSubRelType.isEmpty)
//
// })
} }
@Test @Test
def testNormalizeDOI(): Unit = { def testNormalizeDOI(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}]," val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
val json = template.replace("%s", line) val json = template.replace("%s", line)
val resultList: List[Oaf] = Crossref2Oaf.convert(json) val resultList: List[Oaf] = Crossref2Oaf.convert(json)
@ -479,7 +437,7 @@ class CrossrefMappingTest {
@Test @Test
def testNormalizeDOI2(): Unit = { def testNormalizeDOI2(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("article.json")).mkString val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
val resultList: List[Oaf] = Crossref2Oaf.convert(template) val resultList: List[Oaf] = Crossref2Oaf.convert(template)
assertTrue(resultList.nonEmpty) assertTrue(resultList.nonEmpty)
@ -494,7 +452,7 @@ class CrossrefMappingTest {
@Test @Test
def testLicenseVorClosed() :Unit = { def testLicenseVorClosed() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_vor.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")).mkString
assertNotNull(json) assertNotNull(json)
@ -521,7 +479,7 @@ class CrossrefMappingTest {
@Test @Test
def testLicenseOpen() :Unit = { def testLicenseOpen() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_open.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")).mkString
assertNotNull(json) assertNotNull(json)
@ -544,7 +502,7 @@ class CrossrefMappingTest {
@Test @Test
def testLicenseEmbargoOpen() :Unit = { def testLicenseEmbargoOpen() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo_open.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json")).mkString
assertNotNull(json) assertNotNull(json)
@ -567,7 +525,7 @@ class CrossrefMappingTest {
@Test @Test
def testLicenseEmbargo() :Unit = { def testLicenseEmbargo() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo.json")).mkString
assertNotNull(json) assertNotNull(json)
@ -591,7 +549,7 @@ class CrossrefMappingTest {
@Test @Test
def testLicenseEmbargoDateTime() :Unit = { def testLicenseEmbargoDateTime() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo_datetime.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json")).mkString
assertNotNull(json) assertNotNull(json)
@ -614,7 +572,7 @@ class CrossrefMappingTest {
@Test @Test
def testMultipleURLs() :Unit = { def testMultipleURLs() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("multiple_urls.json")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")).mkString
assertNotNull(json) assertNotNull(json)

View File

@ -1,11 +1,12 @@
package eu.dnetlib.doiboost.mag package eu.dnetlib.dhp.doiboost.mag
import eu.dnetlib.doiboost.mag.{ConversionUtil, MagPapers, SparkProcessMAG}
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.{Dataset, SparkSession}
import org.codehaus.jackson.map.ObjectMapper import org.codehaus.jackson.map.ObjectMapper
import org.json4s.DefaultFormats
import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
import org.json4s.DefaultFormats
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.sql.Timestamp import java.sql.Timestamp
@ -47,7 +48,7 @@ class MAGMappingTest {
@Test @Test
def buildInvertedIndexTest(): Unit = { def buildInvertedIndexTest(): Unit = {
val json_input = Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString val json_input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json")).mkString
val description = ConversionUtil.convertInvertedIndexString(json_input) val description = ConversionUtil.convertInvertedIndexString(json_input)
assertNotNull(description) assertNotNull(description)
assertTrue(description.nonEmpty) assertTrue(description.nonEmpty)
@ -71,7 +72,7 @@ class MAGMappingTest {
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.config(conf) .config(conf)
.getOrCreate() .getOrCreate()
val path = getClass.getResource("magPapers.json").getPath val path = getClass.getResource("/eu/dnetlib/doiboost/mag/magPapers.json").getPath
import org.apache.spark.sql.Encoders import org.apache.spark.sql.Encoders
val schema = Encoders.product[MagPapers].schema val schema = Encoders.product[MagPapers].schema
@ -101,7 +102,7 @@ class MAGMappingTest {
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.config(conf) .config(conf)
.getOrCreate() .getOrCreate()
val path = getClass.getResource("duplicatedMagPapers.json").getPath val path = getClass.getResource("/eu/dnetlib/doiboost/mag/duplicatedMagPapers.json").getPath
import org.apache.spark.sql.Encoders import org.apache.spark.sql.Encoders
val schema = Encoders.product[MagPapers].schema val schema = Encoders.product[MagPapers].schema

View File

@ -1,7 +1,8 @@
package eu.dnetlib.doiboost.orcid package eu.dnetlib.dhp.doiboost.orcid
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.Publication import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.orcid._
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Assertions._
@ -10,9 +11,8 @@ import org.junit.jupiter.api.io.TempDir
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.nio.file.Path import java.nio.file.Path
import scala.io.Source
import scala.collection.JavaConversions._ import scala.collection.JavaConversions._
import scala.io.Source
class MappingORCIDToOAFTest { class MappingORCIDToOAFTest {
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass) val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
@ -20,7 +20,7 @@ class MappingORCIDToOAFTest {
@Test @Test
def testExtractData():Unit ={ def testExtractData():Unit ={
val json = Source.fromInputStream(getClass.getResourceAsStream("dataOutput")).mkString val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput")).mkString
assertNotNull(json) assertNotNull(json)
assertFalse(json.isEmpty) assertFalse(json.isEmpty)
json.lines.foreach(s => { json.lines.foreach(s => {

View File

@ -1,13 +1,13 @@
package eu.dnetlib.doiboost.uw package eu.dnetlib.dhp.doiboost.uw
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.OpenAccessRoute import eu.dnetlib.dhp.schema.oaf.OpenAccessRoute
import eu.dnetlib.doiboost.uw.UnpayWallToOAF
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source import scala.io.Source
import org.junit.jupiter.api.Assertions._
import org.slf4j.{Logger, LoggerFactory}
class UnpayWallMappingTest { class UnpayWallMappingTest {
@ -18,7 +18,7 @@ class UnpayWallMappingTest {
@Test @Test
def testMappingToOAF():Unit ={ def testMappingToOAF():Unit ={
val Ilist = Source.fromInputStream(getClass.getResourceAsStream("input.json")).mkString val Ilist = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json")).mkString
var i:Int = 0 var i:Int = 0
for (line <-Ilist.lines) { for (line <-Ilist.lines) {

View File

@ -0,0 +1 @@
#DHP Enrichment

View File

@ -0,0 +1,26 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu name="Documentation">
<item name="Link1 Collapsable" href="about.html" collapse="true">
<item name="item1" href="pubmed.html"/>
<item name="item2" href="datacite.html"/>
</item>
</menu>
<menu ref="reports"/>
</body>
</project>

View File

@ -88,7 +88,7 @@ public class CleanGraphSparkJob {
readTableFromPath(spark, inputPath, clazz) readTableFromPath(spark, inputPath, clazz)
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz)) .map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz)) .map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
.map((MapFunction<T, T>) GraphCleaningFunctions::cleanup, Encoders.bean(clazz)) .map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
.filter((FilterFunction<T>) GraphCleaningFunctions::filter) .filter((FilterFunction<T>) GraphCleaningFunctions::filter)
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)

View File

@ -30,6 +30,11 @@ public class OafCleaner implements Serializable {
} }
} else if (hasMapping(o, mapping)) { } else if (hasMapping(o, mapping)) {
mapping.get(o.getClass()).accept(o); mapping.get(o.getClass()).accept(o);
for (final Field f : getAllFields(o.getClass())) {
f.setAccessible(true);
final Object val = f.get(o);
navigate(val, mapping);
}
} else { } else {
for (final Field f : getAllFields(o.getClass())) { for (final Field f : getAllFields(o.getClass())) {
f.setAccessible(true); f.setAccessible(true);

View File

@ -8,3 +8,12 @@ CREATE VIEW IF NOT EXISTS ${hiveDbName}.result as
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures from ${hiveDbName}.software s select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures from ${hiveDbName}.software s
union all union all
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures from ${hiveDbName}.otherresearchproduct o; select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures from ${hiveDbName}.otherresearchproduct o;
ANALYZE TABLE ${hiveDbName}.datasource COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.organization COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.project COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.publication COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.dataset COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.otherresearchproduct COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.software COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.relation COMPUTE STATISTICS;

View File

@ -292,7 +292,7 @@
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Import table project</name> <name>Import table relation</name>
<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class> <class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>

View File

@ -8,6 +8,15 @@
<name>unresolvedPath</name> <name>unresolvedPath</name>
<description>the path of the unresolved Entities</description> <description>the path of the unresolved Entities</description>
</property> </property>
<property>
<name>targetPath</name>
<description>the target path after resolution</description>
</property>
<property>
<name>shouldResolveEntities</name>
<value>true</value>
<description>allows to activate/deactivate the resolution process over the entities</description>
</property>
</parameters> </parameters>
<start to="ResolveRelations"/> <start to="ResolveRelations"/>
@ -36,11 +45,20 @@
<arg>--master</arg><arg>yarn</arg> <arg>--master</arg><arg>yarn</arg>
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg> <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
<arg>--workingPath</arg><arg>${workingDir}</arg> <arg>--workingPath</arg><arg>${workingDir}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
</spark> </spark>
<ok to="ResolveEntities"/> <ok to="decision_resolveEntities"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<decision name="decision_resolveEntities">
<switch>
<case to="copy_result">${wf:conf('shouldResolveEntities') eq false}</case>
<case to="ResolveEntities">${wf:conf('shouldResolveEntities') eq true}</case>
<default to="ResolveEntities"/>
</switch>
</decision>
<action name="ResolveEntities"> <action name="ResolveEntities">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -62,11 +80,91 @@
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg> <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
<arg>--unresolvedPath</arg><arg>${unresolvedPath}</arg> <arg>--unresolvedPath</arg><arg>${unresolvedPath}</arg>
<arg>--workingPath</arg><arg>${workingDir}</arg> <arg>--workingPath</arg><arg>${workingDir}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
</spark> </spark>
<ok to="End"/> <ok to="copy_entities"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<end name="End"/> <fork name="copy_result">
<path start="copy_publication"/>
<path start="copy_dataset"/>
<path start="copy_otherresearchproduct"/>
<path start="copy_software"/>
</fork>
<action name="copy_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/publication</arg>
<arg>${nameNode}/${targetPath}/publication</arg>
</distcp>
<ok to="copy_wait_result"/>
<error to="Kill"/>
</action>
<action name="copy_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/dataset</arg>
<arg>${nameNode}/${targetPath}/dataset</arg>
</distcp>
<ok to="copy_wait_result"/>
<error to="Kill"/>
</action>
<action name="copy_otherresearchproduct">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/otherresearchproduct</arg>
<arg>${nameNode}/${targetPath}/otherresearchproduct</arg>
</distcp>
<ok to="copy_wait_result"/>
<error to="Kill"/>
</action>
<action name="copy_software">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/software</arg>
<arg>${nameNode}/${targetPath}/software</arg>
</distcp>
<ok to="copy_wait_result"/>
<error to="Kill"/>
</action>
<join name="copy_wait_result" to="copy_entities"/>
<fork name="copy_entities">
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasource"/>
</fork>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/organization</arg>
<arg>${nameNode}/${targetPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/project</arg>
<arg>${nameNode}/${targetPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasource">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/datasource</arg>
<arg>${nameNode}/${targetPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="End"/>
<end name="End"/>
</workflow-app> </workflow-app>

View File

@ -2,5 +2,6 @@
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the source Path", "paramRequired": true}, {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"u", "paramLongName":"unresolvedPath", "paramDescription": "the source Path", "paramRequired": true}, {"paramName":"u", "paramLongName":"unresolvedPath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true} {"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target path", "paramRequired": true}
] ]

View File

@ -1,5 +1,6 @@
[ [
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the source Path", "paramRequired": true}, {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true} {"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target path", "paramRequired": true}
] ]

View File

@ -1,8 +1,8 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap package eu.dnetlib.dhp.oa.graph.hostedbymap
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {} case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}

View File

@ -2,13 +2,12 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkApplyHostedByMapToResult.{applyHBtoPubs, getClass}
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Datasource, Publication} import eu.dnetlib.dhp.schema.oaf.Datasource
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql._
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}

View File

@ -5,16 +5,14 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.schema.oaf.{Datasource, Instance, OpenAccessRoute, Publication} import eu.dnetlib.dhp.schema.oaf.{Instance, OpenAccessRoute, Publication}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql._
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
object SparkApplyHostedByMapToResult { object SparkApplyHostedByMapToResult {
def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = { def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = {
@ -25,7 +23,7 @@ object SparkApplyHostedByMapToResult {
val ei: EntityInfo = t2._2 val ei: EntityInfo = t2._2
val i = p.getInstance().asScala val i = p.getInstance().asScala
if (i.size == 1) { if (i.size == 1) {
val inst: Instance = i(0) val inst: Instance = i.head
inst.getHostedby.setKey(ei.getHostedById) inst.getHostedby.setKey(ei.getHostedById)
inst.getHostedby.setValue(ei.getName) inst.getHostedby.setValue(ei.getName)
if (ei.getOpenAccess) { if (ei.getOpenAccess) {
@ -39,6 +37,7 @@ object SparkApplyHostedByMapToResult {
p p
})(Encoders.bean(classOf[Publication])) })(Encoders.bean(classOf[Publication]))
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {

View File

@ -3,18 +3,15 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
import eu.dnetlib.dhp.schema.oaf.{Journal, Publication} import eu.dnetlib.dhp.schema.oaf.{Journal, Publication}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql._
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkPrepareHostedByInfoToApply { object SparkPrepareHostedByInfoToApply {
implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo]) implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])

View File

@ -1,22 +1,20 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap package eu.dnetlib.dhp.oa.graph.hostedbymap
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel} import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel}
import eu.dnetlib.dhp.schema.oaf.Datasource import eu.dnetlib.dhp.schema.oaf.Datasource
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
import java.io.PrintWriter import java.io.PrintWriter
import org.apache.hadoop.io.compress.GzipCodec
object SparkProduceHostedByMap { object SparkProduceHostedByMap {
@ -52,7 +50,6 @@ object SparkProduceHostedByMap {
} }
def getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = { def getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = {
if (issn != null) { if (issn != null) {
if (eissn != null) { if (eissn != null) {
@ -163,7 +160,6 @@ object SparkProduceHostedByMap {
} }
def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = { def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = {
val conf = new Configuration() val conf = new Configuration()
@ -182,7 +178,6 @@ object SparkProduceHostedByMap {
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass) val logger: Logger = LoggerFactory.getLogger(getClass)

View File

@ -4,17 +4,11 @@ import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.HdfsSupport import eu.dnetlib.dhp.common.HdfsSupport
import eu.dnetlib.dhp.schema.common.ModelSupport import eu.dnetlib.dhp.schema.common.ModelSupport
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo
import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.io.IOUtils
import org.apache.commons.lang3.StringUtils
import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClients
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory import org.slf4j.LoggerFactory
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.io.Source import scala.io.Source

View File

@ -2,9 +2,8 @@ package eu.dnetlib.dhp.oa.graph.resolution
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.HdfsSupport
import eu.dnetlib.dhp.schema.common.EntityType import eu.dnetlib.dhp.schema.common.EntityType
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
@ -36,21 +35,15 @@ object SparkResolveEntities {
val unresolvedPath = parser.get("unresolvedPath") val unresolvedPath = parser.get("unresolvedPath")
log.info(s"unresolvedPath -> $unresolvedPath") log.info(s"unresolvedPath -> $unresolvedPath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
fs.mkdirs(new Path(workingPath)) fs.mkdirs(new Path(workingPath))
resolveEntities(spark, workingPath, unresolvedPath) resolveEntities(spark, workingPath, unresolvedPath)
generateResolvedEntities(spark, workingPath, graphBasePath) generateResolvedEntities(spark, workingPath, graphBasePath, targetPath)
// TO BE conservative we keep the original entities in the working dir
// and save the resolved entities on the graphBasePath
//In future these lines of code should be removed
entities.foreach {
e =>
fs.rename(new Path(s"$graphBasePath/$e"), new Path(s"$workingPath/${e}_old"))
fs.rename(new Path(s"$workingPath/resolvedGraph/$e"), new Path(s"$graphBasePath/$e"))
}
} }
@ -81,27 +74,32 @@ def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: St
} }
} }
def generateResolvedEntities(spark:SparkSession, workingPath: String, graphBasePath:String) = { def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String, targetPath: String) = {
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
import spark.implicits._ import spark.implicits._
val re:Dataset[Result] = spark.read.load(s"$workingPath/resolvedEntities").as[Result] val re: Dataset[(String, Result)] = spark.read.load(s"$workingPath/resolvedEntities").as[Result].map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
entities.foreach { entities.foreach {
e => e => {
val currentEntityDataset: Dataset[(String, Result)] = spark.read.text(s"$graphBasePath/$e").as[String].map(s => deserializeObject(s, e)).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
currentEntityDataset.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left").map(k => {
val a = k._1
val b = k._2
if (b == null)
a._2
else {
a._2.mergeFrom(b._2)
a._2
}
}).map(r => mapper.writeValueAsString(r))(Encoders.STRING)
.write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$targetPath/$e")
}
spark.read.text(s"$graphBasePath/$e").as[String]
.map(s => deserializeObject(s, e))
.union(re)
.groupByKey(_.getId)
.reduceGroups {
(x, y) =>
x.mergeFrom(y)
x
}.map(_._2)
.filter(r => r.getClass.getSimpleName.toLowerCase != "result")
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
.write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingPath/resolvedGraph/$e")
} }
} }
} }

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.oa.graph.resolution
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.HdfsSupport import eu.dnetlib.dhp.common.HdfsSupport
import eu.dnetlib.dhp.schema.oaf.{Relation, Result} import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.{FileSystem, Path}
@ -35,6 +35,9 @@ object SparkResolveRelation {
val workingPath = parser.get("workingPath") val workingPath = parser.get("workingPath")
log.info(s"workingPath -> $workingPath") log.info(s"workingPath -> $workingPath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
import spark.implicits._ import spark.implicits._
@ -80,20 +83,13 @@ object SparkResolveRelation {
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.save(s"$workingPath/relation_resolved") .save(s"$workingPath/relation_resolved")
// TO BE conservative we keep the original relation in the working dir
// and save the relation resolved on the graphBasePath
//In future this two line of code should be removed
fs.rename(new Path(s"$graphBasePath/relation"), new Path(s"$workingPath/relation"))
spark.read.load(s"$workingPath/relation_resolved").as[Relation] spark.read.load(s"$workingPath/relation_resolved").as[Relation]
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")) .filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
.map(r => mapper.writeValueAsString(r)) .map(r => mapper.writeValueAsString(r))
.write .write
.option("compression", "gzip") .option("compression", "gzip")
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.text(s"$graphBasePath/relation") .text(s"$targetPath/relation")
} }
def extractInstanceCF(input: String): List[(String, String)] = { def extractInstanceCF(input: String): List[(String, String)] = {

View File

@ -18,7 +18,6 @@ object SparkDataciteToOAF {
.config(conf) .config(conf)
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master")).getOrCreate()
import spark.implicits._
val sc = spark.sparkContext val sc = spark.sparkContext

View File

@ -2,7 +2,7 @@ package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.Result
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf import org.apache.spark.SparkConf

View File

@ -5,10 +5,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.sx.scholix.Scholix import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import org.apache.hadoop.io.compress._
object SparkConvertObjectToJson { object SparkConvertObjectToJson {

View File

@ -2,11 +2,12 @@ package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Software,Dataset => OafDataset}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkConvertRDDtoDataset { object SparkConvertRDDtoDataset {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {

View File

@ -1,14 +1,12 @@
package eu.dnetlib.dhp.sx.graph package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset} import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkCreateInputGraph { object SparkCreateInputGraph {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
@ -41,9 +39,6 @@ object SparkCreateInputGraph {
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath") log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")

View File

@ -9,7 +9,7 @@ import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils.RelatedEntities
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.count import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkCreateScholix { object SparkCreateScholix {

View File

@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkCreateSummaryObject { object SparkCreateSummaryObject {

View File

@ -5,6 +5,7 @@ import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import java.util.regex.Pattern import java.util.regex.Pattern
import scala.language.postfixOps import scala.language.postfixOps
import scala.xml.{Elem, Node, XML} import scala.xml.{Elem, Node, XML}

View File

@ -2,11 +2,11 @@ package eu.dnetlib.dhp.sx.graph.pangaea
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.io.Source import scala.io.Source
object SparkGeneratePanagaeaDataset { object SparkGeneratePanagaeaDataset {
@ -46,7 +46,4 @@ object SparkGeneratePanagaeaDataset {
} }
} }

View File

@ -1,6 +1,5 @@
package eu.dnetlib.dhp.sx.graph.scholix package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.scholix._ import eu.dnetlib.dhp.schema.sx.scholix._
import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology} import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
@ -10,10 +9,8 @@ import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.io.Source import scala.io.Source
import scala.language.postfixOps
object ScholixUtils { object ScholixUtils {
@ -21,6 +18,7 @@ object ScholixUtils {
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier" val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
val DATE_RELATION_KEY: String = "RelationDate" val DATE_RELATION_KEY: String = "RelationDate"
case class RelationVocabulary(original: String, inverse: String) {} case class RelationVocabulary(original: String, inverse: String) {}
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {} case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
@ -66,7 +64,6 @@ object ScholixUtils {
} }
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable { val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
override def zero: RelatedEntities = null override def zero: RelatedEntities = null
@ -85,8 +82,7 @@ object ScholixUtils {
if (b1 != null && b2 != null) if (b1 != null && b2 != null)
RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication) RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication)
else else if (b1 != null)
if (b1!= null)
b1 b1
else else
b2 b2
@ -144,7 +140,6 @@ object ScholixUtils {
s s
} }
@ -203,8 +198,7 @@ object ScholixUtils {
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty) if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
r.setPublicationDate(summaryObject.getDate.get(0)) r.setPublicationDate(summaryObject.getDate.get(0))
if (summaryObject.getPublisher!= null && !summaryObject.getPublisher.isEmpty) if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
{
val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
if (plist.nonEmpty) if (plist.nonEmpty)
@ -228,9 +222,6 @@ object ScholixUtils {
} }
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = { def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
if (relation == null || source == null) if (relation == null || source == null)

View File

@ -12,6 +12,8 @@ import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.MappableBlock;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.extension.ExtendWith;
@ -66,9 +68,59 @@ public class GraphCleaningFunctionsTest {
Relation r_out = OafCleaner.apply(r_in, mapping); Relation r_out = OafCleaner.apply(r_in, mapping);
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r_out.getRelClass())); assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r_out.getRelClass()));
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_SUBRELTYPE).contains(r_out.getSubRelType())); assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_SUBRELTYPE).contains(r_out.getSubRelType()));
assertEquals("iis", r_out.getDataInfo().getProvenanceaction().getClassid());
assertEquals("Inferred by OpenAIRE", r_out.getDataInfo().getProvenanceaction().getClassname());
} }
} }
@Test
void testFilter_false() throws Exception {
assertNotNull(vocabularies);
assertNotNull(mapping);
String json = IOUtils
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
assertEquals(false, GraphCleaningFunctions.filter(p_in));
}
@Test
void testFilter_true() throws Exception {
assertNotNull(vocabularies);
assertNotNull(mapping);
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
assertEquals(true, GraphCleaningFunctions.filter(p_in));
}
@Test
void testFilter_missing_invisible() throws Exception {
assertNotNull(vocabularies);
assertNotNull(mapping);
String json = IOUtils
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
assertEquals(true, GraphCleaningFunctions.filter(p_in));
}
@Test @Test
void testCleaning() throws Exception { void testCleaning() throws Exception {
@ -99,6 +151,12 @@ public class GraphCleaningFunctionsTest {
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid()); assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname()); assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
assertEquals("0027", p_out.getInstance().get(1).getInstancetype().getClassid());
assertEquals("Model", p_out.getInstance().get(1).getInstancetype().getClassname());
assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassid());
assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassname());
assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid()); assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid());
assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname()); assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname());
@ -112,7 +170,7 @@ public class GraphCleaningFunctionsTest {
List<Instance> poi = p_out.getInstance(); List<Instance> poi = p_out.getInstance();
assertNotNull(poi); assertNotNull(poi);
assertEquals(1, poi.size()); assertEquals(3, poi.size());
final Instance poii = poi.get(0); final Instance poii = poi.get(0);
assertNotNull(poii); assertNotNull(poii);
@ -140,7 +198,7 @@ public class GraphCleaningFunctionsTest {
assertEquals(5, p_out.getTitle().size()); assertEquals(5, p_out.getTitle().size());
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out); Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
assertEquals(3, p_cleaned.getTitle().size()); assertEquals(3, p_cleaned.getTitle().size());
@ -159,9 +217,12 @@ public class GraphCleaningFunctionsTest {
assertEquals("1970-10-07", p_cleaned.getDateofacceptance().getValue()); assertEquals("1970-10-07", p_cleaned.getDateofacceptance().getValue());
assertEquals("0038", p_cleaned.getInstance().get(2).getInstancetype().getClassid());
assertEquals("Other literature type", p_cleaned.getInstance().get(2).getInstancetype().getClassname());
final List<Instance> pci = p_cleaned.getInstance(); final List<Instance> pci = p_cleaned.getInstance();
assertNotNull(pci); assertNotNull(pci);
assertEquals(1, pci.size()); assertEquals(3, pci.size());
final Instance pcii = pci.get(0); final Instance pcii = pci.get(0);
assertNotNull(pcii); assertNotNull(pcii);
@ -222,4 +283,27 @@ public class GraphCleaningFunctionsTest {
.readLines( .readLines(
GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")); GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
} }
@Test
public void testCleanDoiBoost() throws IOException {
String json = IOUtils
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
}
@Test
public void testCleanDoiBoost2() throws IOException {
String json = IOUtils
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub2.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
}
} }

View File

@ -171,24 +171,6 @@ public class DumpJobTest {
GraphResult gr = verificationDataset.first(); GraphResult gr = verificationDataset.first();
Assertions.assertEquals(2, gr.getMeasures().size());
Assertions
.assertTrue(
gr
.getMeasures()
.stream()
.anyMatch(
m -> m.getKey().equals("influence")
&& m.getValue().equals("1.62759106106e-08")));
Assertions
.assertTrue(
gr
.getMeasures()
.stream()
.anyMatch(
m -> m.getKey().equals("popularity")
&& m.getValue().equals("0.22519296")));
Assertions.assertEquals(6, gr.getAuthor().size()); Assertions.assertEquals(6, gr.getAuthor().size());
Assertions Assertions
.assertTrue( .assertTrue(

View File

@ -708,7 +708,7 @@ class MappersTest {
assertEquals(1, p.getTitle().size()); assertEquals(1, p.getTitle().size());
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
final Publication p_cleaned = cleanup(fixVocabularyNames(p)); final Publication p_cleaned = cleanup(fixVocabularyNames(p), vocs);
assertNotNull(p_cleaned.getTitle()); assertNotNull(p_cleaned.getTitle());
assertFalse(p_cleaned.getTitle().isEmpty()); assertFalse(p_cleaned.getTitle().isEmpty());

Some files were not shown because too many files have changed in this diff Show More