1
0
Fork 0

Merge branch 'beta' into affiliationPropagation

This commit is contained in:
Claudio Atzori 2021-12-14 15:26:56 +01:00
commit 1790fa2d44
125 changed files with 5056 additions and 1362 deletions

2
.gitignore vendored
View File

@ -3,8 +3,6 @@
*.iws
*.ipr
*.iml
*.ipr
*.iws
*~
.vscode
.metals

View File

@ -22,9 +22,20 @@
<id>dnet45-releases</id>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
</repository>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/dhp-code-style</url>
</site>
</distributionManagement>
<build>
<extensions>
<extension>
<groupId>org.apache.maven.wagon</groupId>
<artifactId>wagon-ssh</artifactId>
<version>2.10</version>
</extension>
</extensions>
<pluginManagement>
<plugins>
<plugin>
@ -35,7 +46,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
<version>3.9.1</version>
</plugin>
</plugins>
</pluginManagement>
@ -43,6 +54,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties>
</project>

View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

View File

@ -10,6 +10,9 @@
<packaging>pom</packaging>
<description>This module is a container for the build tools used in dnet-hadoop</description>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
</properties>
<modules>
<module>dhp-code-style</module>
@ -17,4 +20,12 @@
<module>dhp-build-properties-maven-plugin</module>
</modules>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/</url>
</site>
</distributionManagement>
</project>

View File

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

View File

@ -13,7 +13,51 @@
<artifactId>dhp-common</artifactId>
<packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- or wherever -->
<goals>
<goal>doc</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>

View File

@ -57,9 +57,17 @@ public class VocabularyGroup implements Serializable {
final String syn = arr[2].trim();
vocs.addSynonyms(vocId, termId, syn);
}
}
// add the term names as synonyms
vocs.vocs.values().forEach(voc -> {
voc.getTerms().values().forEach(term -> {
voc.addSynonym(term.getName().toLowerCase(), term.getId());
});
});
return vocs;
}

View File

@ -16,6 +16,8 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
@ -86,6 +88,22 @@ public class GraphCleaningFunctions extends CleaningFunctions {
}
public static <T extends Oaf> boolean filter(T value) {
if (Boolean.TRUE
.equals(
Optional
.ofNullable(value)
.map(
o -> Optional
.ofNullable(o.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getInvisible())
.orElse(true))
.orElse(true))
.orElse(true))) {
return false;
}
if (value instanceof Datasource) {
// nothing to evaluate here
} else if (value instanceof Project) {
@ -115,7 +133,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return true;
}
public static <T extends Oaf> T cleanup(T value) {
public static <T extends Oaf> T cleanup(T value, VocabularyGroup vocs) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
@ -212,6 +230,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getFormat())) {
r
.setFormat(
r
.getFormat()
.stream()
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getDescription())) {
r
.setDescription(
@ -234,6 +261,38 @@ public class GraphCleaningFunctions extends CleaningFunctions {
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) {
if (r instanceof Publication) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof Dataset) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof Software) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof OtherResearchProduct) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
}
}
if (Objects.nonNull(i.getPid())) {
i.setPid(processPidCleaning(i.getPid()));
}

View File

@ -0,0 +1,72 @@
package eu.dnetlib.dhp.application
import scala.io.Source
/**
* This is the main Interface SparkApplication
* where all the Spark Scala class should inherit
*
*/
trait SparkScalaApplication {
/**
* This is the path in the classpath of the json
* describes all the argument needed to run
*/
val propertyPath: String
/**
* Utility to parse the arguments using the
* property json in the classpath identified from
* the variable propertyPath
*
* @param args the list of arguments
*/
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString)
parser.parseArgument(args)
parser
}
/**
* Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
def run(): Unit
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.Logger
abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
var parser: ArgumentApplicationParser = null
var spark:SparkSession = null
def initialize():SparkScalaApplication = {
parser = parseArguments(args)
spark = createSparkSession()
this
}
/**
* Utility for creating a spark session starting from parser
*
* @return a spark Session
*/
private def createSparkSession():SparkSession = {
require(parser!= null)
val conf:SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
SparkSession.builder().config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
}
}

View File

@ -107,7 +107,7 @@ class OafMapperUtilsTest {
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());

View File

@ -1,69 +0,0 @@
package eu.dnetlib.dhp.actionmanager.scholix
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object SparkCreateActionset {
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
val workingDirFolder = parser.get("workingDirFolder")
log.info(s"workingDirFolder -> $workingDirFolder")
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result]
implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation]
import spark.implicits._
val relation = spark.read.load(s"$sourcePath/relation").as[Relation]
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
.flatMap(r => List(r.getSource, r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation")
val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]
log.info("extract source and target Identifier involved in relations")
log.info("save relation filtered")
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
.write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf")
log.info("saving entities")
val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))
entities
.joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
.map(p => p._1._2)
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
}
}

View File

@ -1,86 +0,0 @@
package eu.dnetlib.dhp.actionmanager.scholix
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object SparkSaveActionSet {
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
item match {
case dataset: OafDataset =>
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
a.setClazz(classOf[OafDataset])
a.setPayload(dataset)
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
case publication: Publication =>
val a: AtomicAction[Publication] = new AtomicAction[Publication]
a.setClazz(classOf[Publication])
a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case software: Software =>
val a: AtomicAction[Software] = new AtomicAction[Software]
a.setClazz(classOf[Software])
a.setPayload(software)
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
case orp: OtherResearchProduct =>
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
a.setClazz(classOf[OtherResearchProduct])
a.setPayload(orp)
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation])
a.setPayload(relation)
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
case _ =>
null
}
}
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/save_actionset.json")).mkString)
parser.parseArgument(args)
val spark: SparkSession =
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
spark.read.load(sourcePath).as[Oaf]
.map(o => toActionSet(o))
.filter(o => o != null)
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
}
}

View File

@ -7,3 +7,6 @@ log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
log4j.logger.org.apache.spark=FATAL
log4j.logger.org.spark_project=FATAL

View File

@ -0,0 +1,134 @@
package eu.dnetlib.dhp.datacite
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
import java.io.InputStream
import java.time.format.DateTimeFormatter
import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source
/**
* This class represent the dataModel of the input Dataset of Datacite
* @param doi THE DOI
* @param timestamp timestamp of last update date
* @param isActive the record is active or deleted
* @param json the json native records
*/
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
/*
The following class are utility class used for the mapping from
json datacite to OAF Shema
*/
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class OAFRelations(relation:String, inverse:String, relType:String)
class DataciteModelConstants extends Serializable {
}
object DataciteModelConstants {
val REL_TYPE_VALUE:String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val DATACITE_NAME = "Datacite"
val dataInfo: DataInfo = dataciteDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
val subRelTypeMapping: Map[String,OAFRelations] = Map(
ModelConstants.REFERENCES -> OAFRelations(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_REFERENCED_BY -> OAFRelations(ModelConstants.IS_REFERENCED_BY,ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.SUPPLEMENT),
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.SUPPLEMENT),
ModelConstants.HAS_PART -> OAFRelations(ModelConstants.HAS_PART,ModelConstants.IS_PART_OF, ModelConstants.PART),
ModelConstants.IS_PART_OF -> OAFRelations(ModelConstants.IS_PART_OF,ModelConstants.HAS_PART, ModelConstants.PART),
ModelConstants.IS_VERSION_OF-> OAFRelations(ModelConstants.IS_VERSION_OF,ModelConstants.HAS_VERSION,ModelConstants.VERSION),
ModelConstants.HAS_VERSION-> OAFRelations(ModelConstants.HAS_VERSION,ModelConstants.IS_VERSION_OF,ModelConstants.VERSION),
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(ModelConstants.IS_IDENTICAL_TO,ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),
ModelConstants.IS_CONTINUED_BY -> OAFRelations(ModelConstants.IS_CONTINUED_BY,ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
ModelConstants.CONTINUES -> OAFRelations(ModelConstants.CONTINUES,ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_NEW_VERSION_OF-> OAFRelations(ModelConstants.IS_NEW_VERSION_OF,ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
ModelConstants.IS_PREVIOUS_VERSION_OF ->OAFRelations(ModelConstants.IS_PREVIOUS_VERSION_OF,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_SOURCE_OF -> OAFRelations(ModelConstants.IS_SOURCE_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
ModelConstants.IS_DERIVED_FROM -> OAFRelations(ModelConstants.IS_DERIVED_FROM,ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
ModelConstants.CITES -> OAFRelations(ModelConstants.CITES,ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
ModelConstants.IS_CITED_BY -> OAFRelations(ModelConstants.IS_CITED_BY,ModelConstants.CITES, ModelConstants.CITATION),
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(ModelConstants.IS_VARIANT_FORM_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(ModelConstants.IS_OBSOLETED_BY,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
ModelConstants.REVIEWS -> OAFRelations(ModelConstants.REVIEWS,ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
ModelConstants.IS_REVIEWED_BY -> OAFRelations(ModelConstants.IS_REVIEWED_BY,ModelConstants.REVIEWS, ModelConstants.REVIEW),
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
ModelConstants.COMPILES -> OAFRelations(ModelConstants.COMPILES,ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_COMPILED_BY -> OAFRelations(ModelConstants.IS_COMPILED_BY,ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
)
val datacite_filter: List[String] = {
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
require(stream!= null)
Source.fromInputStream(stream).getLines().toList
}
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(false,null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, trust)
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
}

View File

@ -2,131 +2,42 @@ package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.datacite.DataciteModelConstants._
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import java.nio.charset.CodingErrorAction
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.regex.Pattern
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
import scala.language.postfixOps
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
object DataciteToOAFTransformation {
val REL_TYPE_VALUE:String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val subRelTypeMapping: Map[String,(String,String)] = Map(
"References" ->("IsReferencedBy","relationship"),
"IsSupplementTo" ->("IsSupplementedBy","supplement"),
"IsPartOf" ->("HasPart","part"),
"HasPart" ->("IsPartOf","part"),
"IsVersionOf" ->("HasVersion","version"),
"HasVersion" ->("IsVersionOf","version"),
"IsIdenticalTo" ->("IsIdenticalTo","relationship"),
"IsPreviousVersionOf" ->("IsNewVersionOf","version"),
"IsContinuedBy" ->("Continues","relationship"),
"Continues" ->("IsContinuedBy","relationship"),
"IsNewVersionOf" ->("IsPreviousVersionOf","version"),
"IsSupplementedBy" ->("IsSupplementTo","supplement"),
"IsDocumentedBy" ->("Documents","relationship"),
"IsSourceOf" ->("IsDerivedFrom","relationship"),
"Cites" ->("IsCitedBy","citation"),
"IsCitedBy" ->("Cites","citation"),
"IsDerivedFrom" ->("IsSourceOf","relationship"),
"IsVariantFormOf" ->("IsDerivedFrom","version"),
"IsReferencedBy" ->("References","relationship"),
"IsObsoletedBy" ->("IsNewVersionOf","version"),
"Reviews" ->("IsReviewedBy","review"),
"Documents" ->("IsDocumentedBy","relationship"),
"IsCompiledBy" ->("Compiles","relationship"),
"Compiles" ->("IsCompiledBy","relationship"),
"IsReviewedBy" ->("Reviews","review")
)
implicit val codec: Codec = Codec("UTF-8")
codec.onMalformedInput(CodingErrorAction.REPLACE)
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val j_filter: List[String] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
s.lines.toList
}
val mapper = new ObjectMapper()
val unknown_repository: HostedByMapType = HostedByMapType(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
val dataInfo: DataInfo = generateDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite")
val hostedByMap: Map[String, HostedByMapType] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(s)
json.extract[Map[String, HostedByMapType]]
}
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
def filter_json(json: String): Boolean = {
j_filter.exists(f => json.contains(f))
/**
* This method should skip record if json contains invalid text
* defined in gile datacite_filter
*
* @param json
* @return True if the record should be skipped
*/
def skip_record(json: String): Boolean = {
datacite_filter.exists(f => json.contains(f))
}
@deprecated("this method will be removed", "dhp")
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
@ -197,15 +108,17 @@ object DataciteToOAFTransformation {
d
}
def fix_thai_date(input:String, format:String) :String = {
def fix_thai_date(input: String, format: String): String = {
try {
val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format))
val a_date = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
LocalDate.from(d).toString
} catch {
case _: Throwable => ""
}
}
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
@ -324,11 +237,7 @@ object DataciteToOAFTransformation {
val p = match_pattern.get._2
val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}"
List(
generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo)
// REMOVED INVERSE RELATION since there is a specific method that should generate later
// generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
)
List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo))
}
else
List()
@ -337,7 +246,7 @@ object DataciteToOAFTransformation {
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
if (filter_json(input))
if (skip_record(input))
return List()
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -427,15 +336,15 @@ object DataciteToOAFTransformation {
.map(d => d.get)
if (a_date.isDefined) {
if(doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null))
if (doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null))
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
if (doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
}
else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
@ -443,9 +352,9 @@ object DataciteToOAFTransformation {
}
}
else if (publication_year != null) {
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
if (doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
@ -516,8 +425,8 @@ object DataciteToOAFTransformation {
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
if (client.isDefined) {
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), ModelConstants.UNKNOWN_REPOSITORY.getValue))
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier)
@ -549,7 +458,7 @@ object DataciteToOAFTransformation {
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
relations = relations ::: generateRelations(rels,result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
relations = relations ::: generateRelations(rels, result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
}
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
@ -558,7 +467,7 @@ object DataciteToOAFTransformation {
List(result)
}
private def generateRelations(rels: List[RelatedIdentifierType], id:String, date:String):List[Relation] = {
private def generateRelations(rels: List[RelatedIdentifierType], id: String, date: String): List[Relation] = {
rels
.filter(r =>
subRelTypeMapping.contains(r.relationType) && (
@ -571,32 +480,23 @@ object DataciteToOAFTransformation {
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.setDataInfo(dataInfo)
val subRelType = subRelTypeMapping(r.relationType)._2
val subRelType = subRelTypeMapping(r.relationType).relType
rel.setRelType(REL_TYPE_VALUE)
rel.setSubRelType(subRelType)
rel.setRelClass(r.relationType)
val dateProps:KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
rel.setSource(id)
rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier,r.relatedIdentifierType))
rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType))
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue).toList
rel
}).toList
})
}
def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER)
di
}
def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::")

View File

@ -1,64 +1,94 @@
package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH
import eu.dnetlib.dhp.common.Constants.MDSTORE_SIZE_PATH
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object GenerateDataciteDatasetSpark {
class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
/**
* Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
log.info(s"SourcePath is '$sourcePath'")
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
log.info(s"exportLinks is '$exportLinks'")
val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
.master(master)
.getOrCreate()
require(vocabularies != null)
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
val mapper = new ObjectMapper()
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info(s"outputBasePath is '$outputBasePath'")
val targetPath = s"$outputBasePath/$MDSTORE_DATA_PATH"
log.info(s"targetPath is '$targetPath'")
generateDataciteDataset(sourcePath, exportLinks, vocabularies, targetPath, spark)
reportTotalSize(targetPath, outputBasePath)
}
/**
* For working with MDStore we need to store in a file on hdfs the size of
* the current dataset
* @param targetPath
* @param outputBasePath
*/
def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = {
val total_items = spark.read.load(targetPath).count()
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
}
/**
* Generate the transformed and cleaned OAF Dataset from the native one
* @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
* @param exportLinks If true it generates unresolved links
* @param vocabularies vocabularies for cleaning
* @param targetPath the targetPath of the result Dataset
*/
def generateDataciteDataset(sourcePath: String, exportLinks: Boolean, vocabularies: VocabularyGroup, targetPath: String, spark:SparkSession):Unit = {
require(spark!= null)
import spark.implicits._
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
val mapper = new ObjectMapper()
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info("outputBasePath: {}", outputBasePath)
val targetPath = s"$outputBasePath/$MDSTORE_DATA_PATH"
spark.read.load(sourcePath).as[DataciteType]
.filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
.filter(d => d != null)
.flatMap(i => fixRelations(i)).filter(i => i != null)
.write.mode(SaveMode.Overwrite).save(targetPath)
}
val total_items = spark.read.load(targetPath).as[Oaf].count()
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
}
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
new GenerateDataciteDatasetSpark("/eu/dnetlib/dhp/datacite/generate_dataset_params.json", args, log).initialize().run()
}
}

View File

@ -7,6 +7,7 @@ import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import collection.JavaConverters._
object BioDBToOAF {
case class EBILinkItem(id: Long, links: String) {}

View File

@ -1,9 +1,9 @@
package eu.dnetlib.dhp.sx.bio
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf
import BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
@ -36,13 +36,13 @@ object SparkTransformBioDatabaseToOAF {
import spark.implicits._
database.toUpperCase() match {
case "UNIPROT" =>
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
case "PDB" =>
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
case "SCHOLIX" =>
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
case "CROSSREF_LINKS" =>
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.Result
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
import eu.dnetlib.dhp.sx.bio.pubmed._
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration

View File

@ -1,9 +1,8 @@
package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
import eu.dnetlib.dhp.sx.bio.pubmed.PMJournal
import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
import org.apache.commons.io.IOUtils
import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.HttpGet

View File

@ -1,11 +1,10 @@
package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.collection.CollectionUtils
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.bio.BioDBToOAF
import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
import BioDBToOAF.EBILinkItem
import eu.dnetlib.dhp.collection.CollectionUtils
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
@ -38,7 +37,7 @@ object SparkEBILinksToOaf {
ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
.filter(p => BioDBToOAF.EBITargetLinksFilter(p))
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p))
.flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null)
.flatMap(i => CollectionUtils.fixRelations(i)).filter(i => i != null)
.write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -4,7 +4,7 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf._
import scala.collection.JavaConverters._
import collection.JavaConverters._
import java.util.regex.Pattern
@ -22,10 +22,10 @@ object PubMedToOaf {
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
/**
* Cleaning the DOI Applying regex in order to
* remove doi starting with URL
*
* @param doi input DOI
* @return cleaned DOI
*/
@ -93,7 +93,6 @@ object PubMedToOaf {
* @param vocabularyName the input vocabulary name
* @param vocabularies all the vocabularies
* @param term the term to find
*
* @return the cleaned term value
*/
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
@ -106,7 +105,6 @@ object PubMedToOaf {
/**
* Map the Pubmed Article into the OAF instance
*
*
* @param article the pubmed articles
* @param vocabularies the vocabularies
* @return The OAF instance if the mapping did not fail
@ -185,7 +183,6 @@ object PubMedToOaf {
//--------------------------------------------------------------------------------------
// RESULT MAPPING
//--------------------------------------------------------------------------------------
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))

View File

@ -1,9 +1,20 @@
##DHP-Aggregation
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
This module defines a set of oozie workflows for
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
1. the **collection** and **transformation** of metadata records.
2. the **integration** of new external information in the result
### Collection and Transformation
The workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
of each MDStore.
It defines [mappings](mappings.md) for transformation of different datasource (See mapping section).
### Integration of external information in the result
The workflows create new entity in the OpenAIRE format (OAF) which aim is to enrich the result already contained in the graph.
See integration section for more insight

View File

@ -0,0 +1,36 @@
DHP Aggregation - Integration method
=====================================
The integration method can be applied every time new information, which is not aggregated from the repositories
nor computed directly by OpenAIRE, should be added to the results of the graph.
The information integrated so far is:
1. Article impact measures
1. [Bip!Finder](https://dl.acm.org/doi/10.1145/3357384.3357850) scores
2. Result Subjects
1. Integration of Fields od Science and Techonology ([FOS](https://www.qnrf.org/en-us/FOS)) classification in
results subjects.
The method always consists in the creation of a new entity in the OpenAIRE format (OAF entity) containing only the id
and the element in the OAF model that should be used to map the information we want to integrate.
The id is set by using a particular encoding of the given PID
*unresolved:[pid]:[pidtype]*
where
1. *unresolved* is a constant value
2. *pid* is the persistent id value, e.g. 10.5281/zenodo.4707307
3. *pidtype* is the persistent id type, e.g. doi
Such entities are matched against those available in the graph using the result.instance.pid values.
This mechanism can be used to integrate enrichments produced as associated by a given PID.
If a match will be found with one of the results already in the graph that said result will be enriched with the information
present in the new OAF.
All the objects for which a match is not found are discarded.

View File

@ -4,13 +4,13 @@ This section describes the mapping implemented for [MEDLINE/PubMed](https://pubm
Collection
---------
The native data is collected from [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with
the following [shcema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html)
the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html)
Parsing
-------
The resposible class of parsing is [PMParser](./scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates
an intermediate mapping of PubMed Article defined [here](/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html)
The resposible class of parsing is [PMParser](/dnet-hadoop/scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates
an intermediate mapping of PubMed Article defined [here](/dnet-hadoop/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html)
Mapping
@ -50,6 +50,10 @@ The table below describes the mapping from the XML Native to the OAF mapping
|//Author/FullName| author.Forename| Concatenation of forname + lastName if exist |
|FOR ALL AUTHOR | author.rank| sequential number starting from 1|
#TODO
Missing item mapped

View File

@ -20,7 +20,9 @@
<item name="Pubmed" href="pubmed.html"/>
<item name="Datacite" href="datacite.html"/>
</item>
<item name="Release Notes" href="release-notes.html" />
<item name="Integration" href="integration.html" collapse="true">
</item>
<item name="General Information" href="about.html"/>
<item name="JavaDoc" href="apidocs/" />

View File

@ -89,13 +89,13 @@ public class CreateOpenCitationsASTest {
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
workingDir.toString() + "/actionSet1"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.sequenceFile(workingDir.toString() + "/actionSet1", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
@ -121,13 +121,13 @@ public class CreateOpenCitationsASTest {
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
workingDir.toString() + "/actionSet2"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.sequenceFile(workingDir.toString() + "/actionSet2", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
@ -153,13 +153,13 @@ public class CreateOpenCitationsASTest {
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
workingDir.toString() + "/actionSet3"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.sequenceFile(workingDir.toString() + "/actionSet3", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
@ -186,13 +186,13 @@ public class CreateOpenCitationsASTest {
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
workingDir.toString() + "/actionSet4"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.sequenceFile(workingDir.toString() + "/actionSet4", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
@ -226,13 +226,13 @@ public class CreateOpenCitationsASTest {
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
workingDir.toString() + "/actionSet5"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.sequenceFile(workingDir.toString() + "/actionSet5", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
@ -261,13 +261,13 @@ public class CreateOpenCitationsASTest {
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
workingDir.toString() + "/actionSet6"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.sequenceFile(workingDir.toString() + "/actionSet6", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
@ -306,13 +306,13 @@ public class CreateOpenCitationsASTest {
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
workingDir.toString() + "/actionSet7"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.sequenceFile(workingDir.toString() + "/actionSet7", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));

View File

@ -1,56 +0,0 @@
package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import java.text.SimpleDateFormat
import java.util.Locale
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
class DataciteToOAFTest extends AbstractVocabularyTest{
@BeforeEach
def setUp() :Unit = {
super.setUpVocabulary()
}
@Test
def testDateMapping:Unit = {
val inputDate = "2021-07-14T11:52:54+0000"
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
val dt = ISO8601FORMAT.parse(inputDate)
println(dt.getTime)
}
@Test
def testMapping() :Unit = {
val record =Source.fromInputStream(getClass.getResourceAsStream("record.json")).mkString
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true )
res.foreach(r => {
println (mapper.writeValueAsString(r))
println("----------------------------")
})
}
}

View File

@ -0,0 +1,114 @@
package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.commons.io.FileUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.{col, count}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory}
import java.nio.file.{Files, Path}
import java.text.SimpleDateFormat
import java.util.Locale
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
class DataciteToOAFTest extends AbstractVocabularyTest{
private var workingDir:Path = null
val log: Logger = LoggerFactory.getLogger(getClass)
@BeforeEach
def setUp() :Unit = {
workingDir= Files.createTempDirectory(getClass.getSimpleName)
super.setUpVocabulary()
}
@AfterEach
def tearDown() :Unit = {
FileUtils.deleteDirectory(workingDir.toFile)
}
@Test
def testDateMapping:Unit = {
val inputDate = "2021-07-14T11:52:54+0000"
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
val dt = ISO8601FORMAT.parse(inputDate)
println(dt.getTime)
}
@Test
def testConvert(): Unit = {
val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
val conf = new SparkConf()
val spark:SparkSession = SparkSession.builder().config(conf)
.appName(getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
val instance = new GenerateDataciteDatasetSpark(null, null, log)
val targetPath = s"$workingDir/result"
instance.generateDataciteDataset(path, exportLinks = true, vocabularies,targetPath, spark)
import spark.implicits._
val nativeSize =spark.read.load(path).count()
assertEquals(100, nativeSize)
val result:Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
result.map(s => s.getClass.getSimpleName).groupBy(col("value").alias("class")).agg(count("value").alias("Total")).show(false)
val t = spark.read.load(targetPath).count()
assertTrue(t >0)
spark.stop()
}
@Test
def testMapping() :Unit = {
val record =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")).mkString
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true )
res.foreach(r => {
println (mapper.writeValueAsString(r))
println("----------------------------")
})
}
}

View File

@ -90,7 +90,7 @@ public class ReadBlacklistFromDB implements Closeable {
inverse.setSource(target_direct);
String encoding = rs.getString("relationship");
RelationInverse ri = ModelSupport.relationInverseMap.get(encoding);
RelationInverse ri = ModelSupport.findInverse(encoding);
direct.setRelClass(ri.getRelClass());
inverse.setRelClass(ri.getInverseRelClass());
direct.setRelType(ri.getRelType());

View File

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.blacklist;
import java.util.Arrays;
import java.util.List;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.common.RelationInverse;
public class BlacklistRelationTest {
@Test
public void testRelationInverseLookup() {
final List<String> rels = Arrays
.asList(
"resultResult_relationship_IsRelatedTo",
"resultOrganization_affiliation_isAuthorInstitutionOf",
"resultOrganization_affiliation_hasAuthorInstitution",
"datasourceOrganization_provision_isProvidedBy",
"projectOrganization_participation_hasParticipant",
"resultProject_outcome_produces",
"resultProject_outcome_isProducedBy");
rels.forEach(r -> {
RelationInverse inverse = ModelSupport.relationInverseMap.get(r);
Assertions.assertNotNull(inverse);
Assertions.assertNotNull(inverse.getRelType());
Assertions.assertNotNull(inverse.getSubReltype());
Assertions.assertNotNull(inverse.getRelClass());
});
}
}

View File

@ -19,7 +19,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
@ExtendWith(MockitoExtension.class)
class UpdateMatcherTest {
public class UpdateMatcherTest {
UpdateMatcher<String> matcher = new EnrichMissingPublicationDate();

View File

@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
class EnrichMissingPublicationDateTest {
public class EnrichMissingPublicationDateTest {
final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate();

View File

@ -8,7 +8,7 @@ import java.util.Arrays;
import org.junit.jupiter.api.Test;
class SubscriptionUtilsTest {
public class SubscriptionUtilsTest {
@Test
void testVerifyListSimilar() {

View File

@ -9,7 +9,7 @@ import eu.dnetlib.broker.objects.OaBrokerAuthor;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
class TrustUtilsTest {
public class TrustUtilsTest {
private static final double THRESHOLD = 0.95;

View File

@ -139,12 +139,26 @@ abstract class AbstractSparkAction implements Serializable {
protected boolean isOpenorgs(Relation rel) {
return Optional
.ofNullable(rel.getCollectedfrom())
.map(
c -> c
.map(c -> isCollectedFromOpenOrgs(c))
.orElse(false);
}
protected boolean isOpenorgsDedupRel(Relation rel) {
return isOpenorgs(rel) && isOpenOrgsDedupMergeRelation(rel);
}
private boolean isCollectedFromOpenOrgs(List<KeyValue> c) {
return c
.stream()
.filter(Objects::nonNull)
.anyMatch(kv -> ModelConstants.OPENORGS_NAME.equals(kv.getValue())))
.orElse(false);
.anyMatch(kv -> ModelConstants.OPENORGS_NAME.equals(kv.getValue()));
}
private boolean isOpenOrgsDedupMergeRelation(Relation rel) {
return ModelConstants.ORG_ORG_RELTYPE.equals(rel.getRelType()) &&
ModelConstants.DEDUP.equals(rel.getSubRelType())
&& (ModelConstants.IS_MERGED_IN.equals(rel.getRelClass()) ||
ModelConstants.MERGES.equals(rel.getRelClass()));
}
protected static Boolean parseECField(Field<String> field) {

View File

@ -61,7 +61,7 @@ public class SparkCopyRelationsNoOpenorgs extends AbstractSparkAction {
.textFile(relationPath)
.map(patchRelFn(), Encoders.bean(Relation.class))
.toJavaRDD()
.filter(x -> !isOpenorgs(x));
.filter(x -> !isOpenorgsDedupRel(x));
if (log.isDebugEnabled()) {
log.debug("Number of non-Openorgs relations collected: {}", simRels.count());

View File

@ -11,6 +11,8 @@ import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
@ -29,6 +31,8 @@ import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
@ -226,9 +230,10 @@ public class SparkOpenorgsProvisionTest implements Serializable {
new SparkCopyRelationsNoOpenorgs(parser, spark).run(isLookUpService);
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
final JavaRDD<String> rels = jsc.textFile(testDedupGraphBasePath + "/relation");
assertEquals(2382, rels.count());
assertEquals(2380, relations);
}
@Test
@ -250,7 +255,7 @@ public class SparkOpenorgsProvisionTest implements Serializable {
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
assertEquals(4894, relations);
assertEquals(4896, relations);
// check deletedbyinference
final Dataset<Relation> mergeRels = spark

View File

@ -2518,3 +2518,5 @@
{"subRelType": "dedup", "relClass": "isMergedIn", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|openorgs____::5c351d85f02db01ca291acd119f0bd78", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|opendoar____::37248e2f6987b18670dd2b8a51d6ef55", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []}
{"subRelType": "dedup", "relClass": "merges", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|corda_______::6acb33e6ea8c6fcdabc891c80d083c64", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|openorgs____::e38c1a27fcb0f0ab218828e4f5fc7be9", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []}
{"subRelType": "dedup", "relClass": "isMergedIn", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|openorgs____::e38c1a27fcb0f0ab218828e4f5fc7be9", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|corda_______::6acb33e6ea8c6fcdabc891c80d083c64", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []}
{"subRelType": "relationship", "relClass": "IsParentOf", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|openorgs____::e38c1a27fcb0f0ab218828e4f5fc7be9", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|corda_______::6acb33e6ea8c6fcdabc891c80d083c64", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []}
{"subRelType": "relationship", "relClass": "IsChildOf", "dataInfo": {"provenanceaction": {"classid": "sysimport:crosswalk:entityregistry", "classname": "sysimport:crosswalk:entityregistry", "schemeid": "dnet:provenanceActions", "schemename": "dnet:provenanceActions"}, "deletedbyinference": false, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.990"}, "target": "20|corda_______::6acb33e6ea8c6fcdabc891c80d083c64", "lastupdatetimestamp": 1617801137807, "relType": "organizationOrganization", "source": "20|openorgs____::e38c1a27fcb0f0ab218828e4f5fc7be9", "validationDate": null, "collectedfrom": [{"dataInfo": null, "key": "10|openaire____::0362fcdb3076765d9c0041ad331553e8", "value": "OpenOrgs Database"}], "validated": false, "properties": []}

View File

@ -1,21 +1,19 @@
package eu.dnetlib.doiboost
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{AccessRight, DataInfo, Dataset, Field, Instance, KeyValue, Oaf, OpenAccessRoute, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{getClosedAccessQualifier, getEmbargoedAccessQualifier, getUnknownQualifier}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import scala.collection.JavaConverters._

View File

@ -8,11 +8,12 @@ import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkGenerateDOIBoostActionSet {
val logger: Logger = LoggerFactory.getLogger(getClass)
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
@ -42,44 +43,32 @@ object SparkGenerateDOIBoostActionSet {
val asDataset = spark.read.load(dbDatasetPath).as[OafDataset]
.filter(p => p != null || p.getId != null)
.map(d =>DoiBoostMappingUtil.fixResult(d))
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.map(d => DoiBoostMappingUtil.fixResult(d))
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
val asPublication =spark.read.load(dbPublicationPath).as[Publication]
val asPublication = spark.read.load(dbPublicationPath).as[Publication]
.filter(p => p != null || p.getId != null)
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
val asCRelation = spark.read.load(crossRefRelation).as[Relation]
.filter(r => r!= null && r.getSource != null && r.getTarget != null)
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.filter(r => r != null && r.getSource != null && r.getTarget != null)
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.map(d => DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
}

View File

@ -9,28 +9,26 @@ import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import org.apache.spark.sql._
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString,JArray}
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkGenerateDoiBoost {
def extractIdGRID(input:String):List[(String,String)] = {
def extractIdGRID(input: String): List[(String, String)] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val id:String = (json \ "id").extract[String]
val id: String = (json \ "id").extract[String]
val grids:List[String] = for {
val grids: List[String] = for {
JObject(pid) <- json \ "pid"
JField("qualifier", JObject(qualifier)) <- pid
JField("classid", JString(classid)) <-qualifier
JField("classid", JString(classid)) <- qualifier
JField("value", JString(vl)) <- pid
if classid == "GRID"
} yield vl
@ -38,7 +36,6 @@ object SparkGenerateDoiBoost {
}
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
@ -73,7 +70,7 @@ object SparkGenerateDoiBoost {
if (a != null && a._2 != null) {
b.mergeFrom(a._2)
b.setId(a._1)
val authors =AuthorMerger.mergeAuthor(b.getAuthor, a._2.getAuthor)
val authors = AuthorMerger.mergeAuthor(b.getAuthor, a._2.getAuthor)
b.setAuthor(authors)
return b
}
@ -87,11 +84,11 @@ object SparkGenerateDoiBoost {
return b2
}
else {
if (b2 != null ) {
if (b2 != null) {
b1.mergeFrom(b2)
val authors =AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor)
b1.setAuthor(authors)
if (b2.getId!= null && b2.getId.nonEmpty)
if (b2.getId != null && b2.getId.nonEmpty)
b1.setId(b2.getId)
return b1
}
@ -118,10 +115,9 @@ object SparkGenerateDoiBoost {
val crossrefPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/crossrefPublication").as[Publication].map(p => (p.getId, p))
val uwPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/uwPublication").as[Publication].map(p => (p.getId, p))
def applyMerge(item:((String, Publication), (String, Publication))) : Publication =
{
def applyMerge(item: ((String, Publication), (String, Publication))): Publication = {
val crossrefPub = item._1._2
if (item._2!= null) {
if (item._2 != null) {
val otherPub = item._2._2
if (otherPub != null) {
crossrefPub.mergeFrom(otherPub)
@ -130,6 +126,7 @@ object SparkGenerateDoiBoost {
}
crossrefPub
}
crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin")
logger.info("Phase 3) Join Result with ORCID")
val fj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/firstJoin").as[Publication].map(p => (p.getId, p))
@ -143,9 +140,9 @@ object SparkGenerateDoiBoost {
sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication")
val doiBoostPublication: Dataset[(String,Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p=>DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
val doiBoostPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication].filter(p => DoiBoostMappingUtil.filterPublication(p)).map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)
val hostedByDataset : Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem))
val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem))
doiBoostPublication.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
@ -164,21 +161,20 @@ object SparkGenerateDoiBoost {
val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))
val a:Dataset[DoiBoostAffiliation] = paperAffiliation
val a: Dataset[DoiBoostAffiliation] = paperAffiliation
.joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
.select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation]
val magPubs:Dataset[(String,Publication)]= spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s =>s._1!= null )
val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder).filter(s => s._1 != null)
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
val pub:Publication = item._1._2
magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).flatMap(item => {
val pub: Publication = item._1._2
val affiliation = item._2
val affId:String = if (affiliation.GridId.isDefined) s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
val r:Relation = new Relation
val affId: String = if (affiliation.GridId.isDefined) s"unresolved::grid::${affiliation.GridId.get.toLowerCase}" else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
val r: Relation = new Relation
r.setSource(pub.getId)
r.setTarget(affId)
r.setRelType(ModelConstants.RESULT_ORGANIZATION)
@ -186,7 +182,7 @@ object SparkGenerateDoiBoost {
r.setSubRelType(ModelConstants.AFFILIATION)
r.setDataInfo(pub.getDataInfo)
r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
val r1:Relation = new Relation
val r1: Relation = new Relation
r1.setTarget(pub.getId)
r1.setSource(affId)
r1.setRelType(ModelConstants.RESULT_ORGANIZATION)
@ -198,33 +194,31 @@ object SparkGenerateDoiBoost {
})(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved")
val unresolvedRels:Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => {
val unresolvedRels: Dataset[(String, Relation)] = spark.read.load(s"$workingDirPath/doiBoostPublicationAffiliation_unresolved").as[Relation].map(r => {
if (r.getSource.startsWith("unresolved"))
(r.getSource, r)
else if (r.getTarget.startsWith("unresolved"))
(r.getTarget,r)
(r.getTarget, r)
else
("resolved", r)
})(Encoders.tuple(Encoders.STRING, mapEncoderRel))
val openaireOrganization:Dataset[(String,String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x,y) => if (x != null) x else y ).map(_._2)
val openaireOrganization: Dataset[(String, String)] = spark.read.text(openaireOrganizationPath).as[String].flatMap(s => extractIdGRID(s)).groupByKey(_._2).reduceGroups((x, y) => if (x != null) x else y).map(_._2)
unresolvedRels.joinWith(openaireOrganization,unresolvedRels("_1").equalTo(openaireOrganization("_2")))
unresolvedRels.joinWith(openaireOrganization, unresolvedRels("_1").equalTo(openaireOrganization("_2")))
.map { x =>
val currentRels = x._1._2
val currentOrgs = x._2
if (currentOrgs!= null)
if(currentRels.getSource.startsWith("unresolved"))
if (currentOrgs != null)
if (currentRels.getSource.startsWith("unresolved"))
currentRels.setSource(currentOrgs._1)
else
currentRels.setTarget(currentOrgs._1)
currentRels
}.filter(r=> !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
}.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")
magPubs.joinWith(a,magPubs("_1").equalTo(a("PaperId"))).map( item => {
magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId"))).map(item => {
val affiliation = item._2
if (affiliation.GridId.isEmpty) {
val o = new Organization
@ -241,7 +235,7 @@ object SparkGenerateDoiBoost {
}
else
null
}).filter(o=> o!=null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization")
}).filter(o => o != null).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization")
}
}

View File

@ -4,20 +4,19 @@ import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{decideAccessRight, _}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.apache.commons.lang.StringUtils
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JValue, _}
import org.json4s.JsonAST._
import org.json4s.jackson.JsonMethods._
import org.slf4j.{Logger, LoggerFactory}
import java.util
import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex
import java.util
import eu.dnetlib.doiboost.DoiBoostMappingUtil
case class CrossrefDT(doi: String, json:String, timestamp: Long) {}

View File

@ -6,7 +6,7 @@ import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
@ -17,12 +17,12 @@ object CrossrefDataset {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
def to_item(input:String):CrossrefDT = {
def to_item(input: String): CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
val ts:Long = (json \ "indexed" \ "timestamp").extract[Long]
val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String])
val ts: Long = (json \ "indexed" \ "timestamp").extract[Long]
val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String])
CrossrefDT(doi, input, ts)
}
@ -30,7 +30,6 @@ object CrossrefDataset {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
parser.parseArgument(args)
@ -54,7 +53,7 @@ object CrossrefDataset {
return b
if(a.timestamp >b.timestamp) {
if (a.timestamp > b.timestamp) {
return a
}
b
@ -66,7 +65,7 @@ object CrossrefDataset {
if (a == null)
return b
if(a.timestamp >b.timestamp) {
if (a.timestamp > b.timestamp) {
return a
}
b
@ -79,20 +78,20 @@ object CrossrefDataset {
override def finish(reduction: CrossrefDT): CrossrefDT = reduction
}
val workingPath:String = parser.get("workingPath")
val workingPath: String = parser.get("workingPath")
val main_ds:Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
val main_ds: Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
val update =
spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
.map(i =>CrossrefImporter.decompressBlob(i._2.toString))
.map(i =>to_item(i)))
.map(i => CrossrefImporter.decompressBlob(i._2.toString))
.map(i => to_item(i)))
main_ds.union(update).groupByKey(_.doi)
.agg(crossrefAggregator.toColumn)
.map(s=>s._2)
.map(s => s._2)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated")
}

View File

@ -2,17 +2,12 @@ package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.crossref.CrossrefDataset.to_item
import eu.dnetlib.doiboost.crossref.UnpackCrtossrefEntries.getClass
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST.JArray
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
@ -24,11 +19,10 @@ object GenerateCrossrefDataset {
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
def crossrefElement(meta: String): CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(meta)
val doi:String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String])
val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String])
val timestamp: Long = (json \ "indexed" \ "timestamp").extract[Long]
CrossrefDT(doi, meta, timestamp)
@ -51,14 +45,14 @@ object GenerateCrossrefDataset {
import spark.implicits._
val tmp : RDD[String] = sc.textFile(sourcePath,6000)
val tmp: RDD[String] = sc.textFile(sourcePath, 6000)
spark.createDataset(tmp)
.map(entry => crossrefElement(entry))
.write.mode(SaveMode.Overwrite).save(targetPath)
// .map(meta => crossrefElement(meta))
// .toDS.as[CrossrefDT]
// .write.mode(SaveMode.Overwrite).save(targetPath)
// .map(meta => crossrefElement(meta))
// .toDS.as[CrossrefDT]
// .write.mode(SaveMode.Overwrite).save(targetPath)
}

View File

@ -4,10 +4,8 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}

View File

@ -2,8 +2,8 @@ package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST.JArray
@ -17,9 +17,7 @@ object UnpackCrtossrefEntries {
val log: Logger = LoggerFactory.getLogger(UnpackCrtossrefEntries.getClass)
def extractDump(input:String):List[String] = {
def extractDump(input: String): List[String] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@ -30,7 +28,6 @@ object UnpackCrtossrefEntries {
}
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_dump_reader/generate_dataset_params.json")).mkString)
@ -45,7 +42,7 @@ object UnpackCrtossrefEntries {
.getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.wholeTextFiles(sourcePath,6000).flatMap(d =>extractDump(d._2))
sc.wholeTextFiles(sourcePath, 6000).flatMap(d => extractDump(d._2))
.saveAsTextFile(targetPath, classOf[GzipCodec])

View File

@ -5,10 +5,10 @@ import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import scala.collection.JavaConverters._
import scala.collection.mutable

View File

@ -3,8 +3,8 @@ package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkImportMagIntoDataset {
@ -24,13 +24,13 @@ object SparkImportMagIntoDataset {
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")),
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
"PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
"PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
@ -75,7 +75,6 @@ object SparkImportMagIntoDataset {
.master(parser.get("master")).getOrCreate()
stream.foreach { case (k, v) =>
val s: StructType = getSchema(k)
val df = spark.read

View File

@ -5,22 +5,19 @@ import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.{col, collect_list, struct}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkProcessMAG {
def getDistinctResults (d:Dataset[MagPapers]):Dataset[MagPapers]={
def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
d.where(col("Doi").isNotNull)
.groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING)
.reduceGroups((p1:MagPapers,p2:MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1,p2))
.reduceGroups((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
.map(_._2)(Encoders.product[MagPapers])
.map(mp => {
new MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi),
MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi),
mp.DocType, mp.PaperTitle, mp.OriginalTitle,
mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String,
mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId,
@ -102,9 +99,9 @@ object SparkProcessMAG {
val conference = spark.read.load(s"$sourcePath/ConferenceInstances")
.select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate",$"EndDate" )
.select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate", $"EndDate")
val conferenceInstance = conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
.select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate",$"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance]
.select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate", $"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance]
magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
@ -146,13 +143,12 @@ object SparkProcessMAG {
.save(s"$workingPath/mag_publication")
spark.read.load(s"$workingPath/mag_publication").as[Publication]
.filter(p => p.getId == null)
.filter(p => p.getId != null)
.groupByKey(p => p.getId)
.reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
.reduceGroups((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b))
.map(_._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
}
}

View File

@ -4,17 +4,16 @@ import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
import eu.dnetlib.dhp.schema.orcid.{AuthorData, OrcidDOI}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo}
import org.apache.commons.lang.StringUtils
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.JsonAST._
import org.json4s.jackson.JsonMethods._
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
case class ORCIDItem(doi:String, authors:List[OrcidAuthor]){}

View File

@ -11,10 +11,10 @@ object SparkConvertORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
def run(spark:SparkSession, workingPath:String, targetPath:String) :Unit = {
def run(spark: SparkSession, workingPath: String, targetPath: String): Unit = {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
import spark.implicits._
val dataset: Dataset[ORCIDItem] =spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
val dataset: Dataset[ORCIDItem] = spark.read.load(s"$workingPath/orcidworksWithAuthor").as[ORCIDItem]
logger.info("Converting ORCID to OAF")
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).write.mode(SaveMode.Overwrite).save(targetPath)
@ -35,7 +35,7 @@ object SparkConvertORCIDToOAF {
val workingPath = parser.get("workingPath")
val targetPath = parser.get("targetPath")
run(spark,workingPath, targetPath)
run(spark, workingPath, targetPath)
}

View File

@ -1,44 +1,41 @@
package eu.dnetlib.doiboost.orcid
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql.functions.{col, collect_list}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
object SparkPreprocessORCID {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
def fixORCIDItem(item :ORCIDItem):ORCIDItem = {
def fixORCIDItem(item: ORCIDItem): ORCIDItem = {
ORCIDItem(item.doi, item.authors.groupBy(_.oid).map(_._2.head).toList)
}
def run(spark:SparkSession,sourcePath:String,workingPath:String):Unit = {
def run(spark: SparkSession, sourcePath: String, workingPath: String): Unit = {
import spark.implicits._
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
val inputRDD:RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s!= null).filter(s => ORCIDToOAF.authorValid(s))
val inputRDD: RDD[OrcidAuthor] = spark.sparkContext.textFile(s"$sourcePath/authors").map(s => ORCIDToOAF.convertORCIDAuthor(s)).filter(s => s != null).filter(s => ORCIDToOAF.authorValid(s))
spark.createDataset(inputRDD).as[OrcidAuthor].write.mode(SaveMode.Overwrite).save(s"$workingPath/author")
val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s!= null)
val res = spark.sparkContext.textFile(s"$sourcePath/works").flatMap(s => ORCIDToOAF.extractDOIWorks(s)).filter(s => s != null)
spark.createDataset(res).as[OrcidWork].write.mode(SaveMode.Overwrite).save(s"$workingPath/works")
val authors :Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor]
val authors: Dataset[OrcidAuthor] = spark.read.load(s"$workingPath/author").as[OrcidAuthor]
val works :Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork]
val works: Dataset[OrcidWork] = spark.read.load(s"$workingPath/works").as[OrcidWork]
works.joinWith(authors, authors("oid").equalTo(works("oid")))
.map(i =>{
.map(i => {
val doi = i._1.doi
val author = i._2
(doi, author)

View File

@ -1,16 +1,14 @@
package eu.dnetlib.doiboost.uw
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
object SparkMapUnpayWallToOAF {
def main(args: Array[String]): Unit = {
@ -32,11 +30,11 @@ object SparkMapUnpayWallToOAF {
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val inputRDD:RDD[String] = spark.sparkContext.textFile(s"$sourcePath")
val inputRDD: RDD[String] = spark.sparkContext.textFile(s"$sourcePath")
logger.info("Converting UnpayWall to OAF")
val d:Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p=>p!=null)).as[Publication]
val d: Dataset[Publication] = spark.createDataset(inputRDD.map(UnpayWallToOAF.convertToOAF).filter(p => p != null)).as[Publication]
d.write.mode(SaveMode.Overwrite).save(targetPath)
}

View File

@ -4,14 +4,13 @@ import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Instance, OpenAccessRoute, Publication}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
import eu.dnetlib.doiboost.uw.UnpayWallToOAF.get_unpaywall_color

View File

@ -1,70 +0,0 @@
package eu.dnetlib.dhp.doiboost
import eu.dnetlib.dhp.schema.oaf.{Publication, Dataset => OafDataset}
import eu.dnetlib.doiboost.{DoiBoostMappingUtil, HostedByItemType}
import eu.dnetlib.doiboost.SparkGenerateDoiBoost.getClass
import eu.dnetlib.doiboost.mag.ConversionUtil
import eu.dnetlib.doiboost.orcid.ORCIDElement
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Test
import scala.io.Source
class DoiBoostHostedByMapTest {
// @Test
// def testMerge():Unit = {
// val conf: SparkConf = new SparkConf()
// val spark: SparkSession =
// SparkSession
// .builder()
// .config(conf)
// .appName(getClass.getSimpleName)
// .master("local[*]").getOrCreate()
//
//
//
// implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
// implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
// implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
//
//
// import spark.implicits._
// val dataset:RDD[String]= spark.sparkContext.textFile("/home/sandro/Downloads/hbMap.gz")
//
//
// val hbMap:Dataset[(String, HostedByItemType)] =spark.createDataset(dataset.map(DoiBoostMappingUtil.toHostedByItem))
//
//
// hbMap.show()
//
//
//
//
//
//
//
//
//
//
// }
@Test
def idDSGeneration():Unit = {
val s ="doajarticles::0066-782X"
println(DoiBoostMappingUtil.generateDSId(s))
}
}

View File

@ -0,0 +1,20 @@
package eu.dnetlib.dhp.doiboost
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.junit.jupiter.api.Test
class DoiBoostHostedByMapTest {
@Test
def idDSGeneration():Unit = {
val s ="doajarticles::0066-782X"
println(DoiBoostMappingUtil.generateDSId(s))
}
}

View File

@ -1,7 +1,8 @@
package eu.dnetlib.doiboost.crossref
package eu.dnetlib.dhp.doiboost.crossref
import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.utils.DHPUtils
import eu.dnetlib.doiboost.crossref.Crossref2Oaf
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
@ -21,9 +22,9 @@ class CrossrefMappingTest {
@Test
def testFunderRelationshipsMapping(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString
val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("funder_doi")).mkString
val funder_name = Source.fromInputStream(getClass.getResourceAsStream("funder_doi")).mkString
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
val funder_doi = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
val funder_name = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi")).mkString
for (line <- funder_doi.lines) {
@ -72,7 +73,7 @@ class CrossrefMappingTest {
@Test
def testOrcidID() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("orcid_data.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/orcid_data.json")).mkString
assertNotNull(json)
@ -93,7 +94,7 @@ class CrossrefMappingTest {
@Test
def testEmptyTitle() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("empty_title.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/empty_title.json")).mkString
assertNotNull(json)
@ -115,7 +116,7 @@ class CrossrefMappingTest {
@Test
def testPeerReviewed(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("prwTest.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/prwTest.json")).mkString
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
assertNotNull(json)
@ -156,7 +157,7 @@ class CrossrefMappingTest {
@Test
def testJournalRelation(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("awardTest.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/awardTest.json")).mkString
assertNotNull(json)
assertFalse(json.isEmpty)
@ -177,7 +178,7 @@ class CrossrefMappingTest {
@Test
def testConvertBookFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("book.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/book.json")).mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -233,7 +234,7 @@ class CrossrefMappingTest {
@Test
def testConvertPreprintFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("preprint.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/preprint.json")).mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -291,7 +292,7 @@ class CrossrefMappingTest {
@Test
def testConvertDatasetFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("dataset.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dataset.json")).mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -332,7 +333,7 @@ class CrossrefMappingTest {
@Test
def testConvertArticleFromCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("article.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -400,7 +401,7 @@ class CrossrefMappingTest {
@Test
def testSetDateOfAcceptanceCrossRef2Oaf(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("dump_file.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/dump_file.json")).mkString
assertNotNull(json)
assertFalse(json.isEmpty);
@ -415,55 +416,12 @@ class CrossrefMappingTest {
assert(items.size == 1)
val result: Result = items.head.asInstanceOf[Publication]
assertNotNull(result)
logger.info(mapper.writeValueAsString(result));
// assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
// assertNotNull(
// result.getDataInfo.getProvenanceaction,
// "DataInfo/Provenance test not null Failed");
// assertFalse(
// result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
// "DataInfo/Provenance/classId test not null Failed");
// assertFalse(
// result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
// "DataInfo/Provenance/className test not null Failed");
// assertFalse(
// result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
// "DataInfo/Provenance/SchemeId test not null Failed");
// assertFalse(
// result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
// "DataInfo/Provenance/SchemeName test not null Failed");
//
// assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
// assertFalse(result.getCollectedfrom.isEmpty);
//
// val collectedFromList = result.getCollectedfrom.asScala
// assert(collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")), "Wrong collected from assertion")
//
// assert(collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")), "Wrong collected from assertion")
//
//
// val relevantDates = result.getRelevantdate.asScala
//
// assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
//
// val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
// assertFalse(rels.isEmpty)
// rels.foreach(relation => {
// assertNotNull(relation)
// assertFalse(relation.getSource.isEmpty)
// assertFalse(relation.getTarget.isEmpty)
// assertFalse(relation.getRelClass.isEmpty)
// assertFalse(relation.getRelType.isEmpty)
// assertFalse(relation.getSubRelType.isEmpty)
//
// })
}
@Test
def testNormalizeDOI(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article_funder_template.json")).mkString
val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
val json = template.replace("%s", line)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
@ -479,7 +437,7 @@ class CrossrefMappingTest {
@Test
def testNormalizeDOI2(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("article.json")).mkString
val template = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/article.json")).mkString
val resultList: List[Oaf] = Crossref2Oaf.convert(template)
assertTrue(resultList.nonEmpty)
@ -494,7 +452,7 @@ class CrossrefMappingTest {
@Test
def testLicenseVorClosed() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_vor.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_vor.json")).mkString
assertNotNull(json)
@ -521,7 +479,7 @@ class CrossrefMappingTest {
@Test
def testLicenseOpen() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_open.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_open.json")).mkString
assertNotNull(json)
@ -544,7 +502,7 @@ class CrossrefMappingTest {
@Test
def testLicenseEmbargoOpen() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo_open.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_open.json")).mkString
assertNotNull(json)
@ -567,7 +525,7 @@ class CrossrefMappingTest {
@Test
def testLicenseEmbargo() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo.json")).mkString
assertNotNull(json)
@ -591,7 +549,7 @@ class CrossrefMappingTest {
@Test
def testLicenseEmbargoDateTime() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("publication_license_embargo_datetime.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/publication_license_embargo_datetime.json")).mkString
assertNotNull(json)
@ -614,7 +572,7 @@ class CrossrefMappingTest {
@Test
def testMultipleURLs() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("multiple_urls.json")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/multiple_urls.json")).mkString
assertNotNull(json)

View File

@ -1,11 +1,12 @@
package eu.dnetlib.doiboost.mag
package eu.dnetlib.dhp.doiboost.mag
import eu.dnetlib.doiboost.mag.{ConversionUtil, MagPapers, SparkProcessMAG}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, SparkSession}
import org.codehaus.jackson.map.ObjectMapper
import org.json4s.DefaultFormats
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
import org.json4s.DefaultFormats
import org.slf4j.{Logger, LoggerFactory}
import java.sql.Timestamp
@ -47,7 +48,7 @@ class MAGMappingTest {
@Test
def buildInvertedIndexTest(): Unit = {
val json_input = Source.fromInputStream(getClass.getResourceAsStream("invertedIndex.json")).mkString
val json_input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/mag/invertedIndex.json")).mkString
val description = ConversionUtil.convertInvertedIndexString(json_input)
assertNotNull(description)
assertTrue(description.nonEmpty)
@ -71,7 +72,7 @@ class MAGMappingTest {
.appName(getClass.getSimpleName)
.config(conf)
.getOrCreate()
val path = getClass.getResource("magPapers.json").getPath
val path = getClass.getResource("/eu/dnetlib/doiboost/mag/magPapers.json").getPath
import org.apache.spark.sql.Encoders
val schema = Encoders.product[MagPapers].schema
@ -101,7 +102,7 @@ class MAGMappingTest {
.appName(getClass.getSimpleName)
.config(conf)
.getOrCreate()
val path = getClass.getResource("duplicatedMagPapers.json").getPath
val path = getClass.getResource("/eu/dnetlib/doiboost/mag/duplicatedMagPapers.json").getPath
import org.apache.spark.sql.Encoders
val schema = Encoders.product[MagPapers].schema

View File

@ -1,7 +1,8 @@
package eu.dnetlib.doiboost.orcid
package eu.dnetlib.dhp.doiboost.orcid
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.orcid._
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Assertions._
@ -10,9 +11,8 @@ import org.junit.jupiter.api.io.TempDir
import org.slf4j.{Logger, LoggerFactory}
import java.nio.file.Path
import scala.io.Source
import scala.collection.JavaConversions._
import scala.io.Source
class MappingORCIDToOAFTest {
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
@ -20,7 +20,7 @@ class MappingORCIDToOAFTest {
@Test
def testExtractData():Unit ={
val json = Source.fromInputStream(getClass.getResourceAsStream("dataOutput")).mkString
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/orcid/dataOutput")).mkString
assertNotNull(json)
assertFalse(json.isEmpty)
json.lines.foreach(s => {

View File

@ -1,13 +1,13 @@
package eu.dnetlib.doiboost.uw
package eu.dnetlib.dhp.doiboost.uw
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.OpenAccessRoute
import eu.dnetlib.doiboost.uw.UnpayWallToOAF
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
import org.junit.jupiter.api.Assertions._
import org.slf4j.{Logger, LoggerFactory}
class UnpayWallMappingTest {
@ -18,7 +18,7 @@ class UnpayWallMappingTest {
@Test
def testMappingToOAF():Unit ={
val Ilist = Source.fromInputStream(getClass.getResourceAsStream("input.json")).mkString
val Ilist = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/uw/input.json")).mkString
var i:Int = 0
for (line <-Ilist.lines) {

View File

@ -0,0 +1 @@
#DHP Enrichment

View File

@ -0,0 +1,26 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu name="Documentation">
<item name="Link1 Collapsable" href="about.html" collapse="true">
<item name="item1" href="pubmed.html"/>
<item name="item2" href="datacite.html"/>
</item>
</menu>
<menu ref="reports"/>
</body>
</project>

View File

@ -88,7 +88,7 @@ public class CleanGraphSparkJob {
readTableFromPath(spark, inputPath, clazz)
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
.map((MapFunction<T, T>) GraphCleaningFunctions::cleanup, Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
.filter((FilterFunction<T>) GraphCleaningFunctions::filter)
.write()
.mode(SaveMode.Overwrite)

View File

@ -30,6 +30,11 @@ public class OafCleaner implements Serializable {
}
} else if (hasMapping(o, mapping)) {
mapping.get(o.getClass()).accept(o);
for (final Field f : getAllFields(o.getClass())) {
f.setAccessible(true);
final Object val = f.get(o);
navigate(val, mapping);
}
} else {
for (final Field f : getAllFields(o.getClass())) {
f.setAccessible(true);

View File

@ -8,3 +8,12 @@ CREATE VIEW IF NOT EXISTS ${hiveDbName}.result as
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures from ${hiveDbName}.software s
union all
select id, originalid, dateofcollection, title, publisher, bestaccessright, datainfo, collectedfrom, pid, author, resulttype, language, country, subject, description, dateofacceptance, relevantdate, embargoenddate, resourcetype, context, externalreference, instance, measures from ${hiveDbName}.otherresearchproduct o;
ANALYZE TABLE ${hiveDbName}.datasource COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.organization COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.project COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.publication COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.dataset COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.otherresearchproduct COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.software COMPUTE STATISTICS;
ANALYZE TABLE ${hiveDbName}.relation COMPUTE STATISTICS;

View File

@ -292,7 +292,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Import table project</name>
<name>Import table relation</name>
<class>eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>

View File

@ -8,6 +8,15 @@
<name>unresolvedPath</name>
<description>the path of the unresolved Entities</description>
</property>
<property>
<name>targetPath</name>
<description>the target path after resolution</description>
</property>
<property>
<name>shouldResolveEntities</name>
<value>true</value>
<description>allows to activate/deactivate the resolution process over the entities</description>
</property>
</parameters>
<start to="ResolveRelations"/>
@ -36,11 +45,20 @@
<arg>--master</arg><arg>yarn</arg>
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
<arg>--workingPath</arg><arg>${workingDir}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
</spark>
<ok to="ResolveEntities"/>
<ok to="decision_resolveEntities"/>
<error to="Kill"/>
</action>
<decision name="decision_resolveEntities">
<switch>
<case to="copy_result">${wf:conf('shouldResolveEntities') eq false}</case>
<case to="ResolveEntities">${wf:conf('shouldResolveEntities') eq true}</case>
<default to="ResolveEntities"/>
</switch>
</decision>
<action name="ResolveEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -62,11 +80,91 @@
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
<arg>--unresolvedPath</arg><arg>${unresolvedPath}</arg>
<arg>--workingPath</arg><arg>${workingDir}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
</spark>
<ok to="End"/>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<end name="End"/>
<fork name="copy_result">
<path start="copy_publication"/>
<path start="copy_dataset"/>
<path start="copy_otherresearchproduct"/>
<path start="copy_software"/>
</fork>
<action name="copy_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/publication</arg>
<arg>${nameNode}/${targetPath}/publication</arg>
</distcp>
<ok to="copy_wait_result"/>
<error to="Kill"/>
</action>
<action name="copy_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/dataset</arg>
<arg>${nameNode}/${targetPath}/dataset</arg>
</distcp>
<ok to="copy_wait_result"/>
<error to="Kill"/>
</action>
<action name="copy_otherresearchproduct">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/otherresearchproduct</arg>
<arg>${nameNode}/${targetPath}/otherresearchproduct</arg>
</distcp>
<ok to="copy_wait_result"/>
<error to="Kill"/>
</action>
<action name="copy_software">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/software</arg>
<arg>${nameNode}/${targetPath}/software</arg>
</distcp>
<ok to="copy_wait_result"/>
<error to="Kill"/>
</action>
<join name="copy_wait_result" to="copy_entities"/>
<fork name="copy_entities">
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasource"/>
</fork>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/organization</arg>
<arg>${nameNode}/${targetPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/project</arg>
<arg>${nameNode}/${targetPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasource">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${graphBasePath}/datasource</arg>
<arg>${nameNode}/${targetPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="End"/>
<end name="End"/>
</workflow-app>

View File

@ -2,5 +2,6 @@
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"u", "paramLongName":"unresolvedPath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true}
{"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target path", "paramRequired": true}
]

View File

@ -1,5 +1,6 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the source Path", "paramRequired": true},
{"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true}
{"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target path", "paramRequired": true}
]

View File

@ -1,8 +1,8 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}

View File

@ -2,13 +2,12 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.hostedbymap.SparkApplyHostedByMapToResult.{applyHBtoPubs, getClass}
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Datasource, Publication}
import eu.dnetlib.dhp.schema.oaf.Datasource
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql._
import org.json4s.DefaultFormats
import org.slf4j.{Logger, LoggerFactory}
@ -52,18 +51,18 @@ object SparkApplyHostedByMapToDatasource {
val mapper = new ObjectMapper()
val dats : Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource")
val dats: Dataset[Datasource] = spark.read.textFile(graphPath + "/datasource")
.map(r => mapper.readValue(r, classOf[Datasource]))
val pinfo : Dataset[EntityInfo] = Aggregators.datasourceToSingleId( spark.read.textFile(preparedInfoPath)
val pinfo: Dataset[EntityInfo] = Aggregators.datasourceToSingleId(spark.read.textFile(preparedInfoPath)
.map(ei => mapper.readValue(ei, classOf[EntityInfo])))
applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath)
applyHBtoDats(pinfo, dats).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
spark.read.textFile(outputPath)
.write
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.option("compression", "gzip")
.text(graphPath + "/datasource")
}

View File

@ -5,16 +5,14 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.schema.oaf.{Datasource, Instance, OpenAccessRoute, Publication}
import eu.dnetlib.dhp.schema.oaf.{Instance, OpenAccessRoute, Publication}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql._
import org.json4s.DefaultFormats
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkApplyHostedByMapToResult {
def applyHBtoPubs(join: Dataset[EntityInfo], pubs: Dataset[Publication]) = {
@ -25,7 +23,7 @@ object SparkApplyHostedByMapToResult {
val ei: EntityInfo = t2._2
val i = p.getInstance().asScala
if (i.size == 1) {
val inst: Instance = i(0)
val inst: Instance = i.head
inst.getHostedby.setKey(ei.getHostedById)
inst.getHostedby.setValue(ei.getName)
if (ei.getOpenAccess) {
@ -39,6 +37,7 @@ object SparkApplyHostedByMapToResult {
p
})(Encoders.bean(classOf[Publication]))
}
def main(args: Array[String]): Unit = {
@ -67,18 +66,18 @@ object SparkApplyHostedByMapToResult {
implicit val mapEncoderEinfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
val mapper = new ObjectMapper()
val pubs : Dataset[Publication] = spark.read.textFile(graphPath + "/publication")
val pubs: Dataset[Publication] = spark.read.textFile(graphPath + "/publication")
.map(r => mapper.readValue(r, classOf[Publication]))
val pinfo : Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath)
val pinfo: Dataset[EntityInfo] = spark.read.textFile(preparedInfoPath)
.map(ei => mapper.readValue(ei, classOf[EntityInfo]))
applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression","gzip").json(outputPath)
applyHBtoPubs(pinfo, pubs).write.mode(SaveMode.Overwrite).option("compression", "gzip").json(outputPath)
spark.read.textFile(outputPath)
.write
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.option("compression", "gzip")
.text(graphPath + "/publication")
}

View File

@ -3,61 +3,58 @@ package eu.dnetlib.dhp.oa.graph.hostedbymap
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.EntityInfo
import eu.dnetlib.dhp.schema.oaf.{Journal, Publication}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql._
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
object SparkPrepareHostedByInfoToApply {
implicit val mapEncoderPInfo: Encoder[EntityInfo] = Encoders.bean(classOf[EntityInfo])
def getList(id: String, j: Journal, name: String ) : List[EntityInfo] = {
var lst:List[EntityInfo] = List()
def getList(id: String, j: Journal, name: String): List[EntityInfo] = {
var lst: List[EntityInfo] = List()
if (j.getIssnLinking != null && !j.getIssnLinking.equals("")){
if (j.getIssnLinking != null && !j.getIssnLinking.equals("")) {
lst = EntityInfo.newInstance(id, j.getIssnLinking, name) :: lst
}
if (j.getIssnOnline != null && !j.getIssnOnline.equals("")){
if (j.getIssnOnline != null && !j.getIssnOnline.equals("")) {
lst = EntityInfo.newInstance(id, j.getIssnOnline, name) :: lst
}
if (j.getIssnPrinted != null && !j.getIssnPrinted.equals("")){
if (j.getIssnPrinted != null && !j.getIssnPrinted.equals("")) {
lst = EntityInfo.newInstance(id, j.getIssnPrinted, name) :: lst
}
lst
}
def prepareResultInfo(spark:SparkSession, publicationPath:String) : Dataset[EntityInfo] = {
def prepareResultInfo(spark: SparkSession, publicationPath: String): Dataset[EntityInfo] = {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.bean(classOf[Publication])
val mapper = new ObjectMapper()
val dd : Dataset[Publication] = spark.read.textFile(publicationPath)
val dd: Dataset[Publication] = spark.read.textFile(publicationPath)
.map(r => mapper.readValue(r, classOf[Publication]))
dd.filter(p => p.getJournal != null ).flatMap(p => getList(p.getId, p.getJournal, ""))
dd.filter(p => p.getJournal != null).flatMap(p => getList(p.getId, p.getJournal, ""))
}
def toEntityInfo(input:String): EntityInfo = {
def toEntityInfo(input: String): EntityInfo = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
val c :Map[String,HostedByItemType] = json.extract[Map[String, HostedByItemType]]
val c: Map[String, HostedByItemType] = json.extract[Map[String, HostedByItemType]]
toEntityItem(c.keys.head, c.values.head)
}
def toEntityItem(journal_id: String , hbi: HostedByItemType): EntityInfo = {
def toEntityItem(journal_id: String, hbi: HostedByItemType): EntityInfo = {
EntityInfo.newInstance(hbi.id, journal_id, hbi.officialname, hbi.openAccess)
@ -67,7 +64,7 @@ object SparkPrepareHostedByInfoToApply {
Aggregators.resultToSingleId(res.joinWith(hbm, res.col("journalId").equalTo(hbm.col("journalId")), "left")
.map(t2 => {
val res: EntityInfo = t2._1
if(t2._2 != null ){
if (t2._2 != null) {
val ds = t2._2
res.setHostedById(ds.getId)
res.setOpenAccess(ds.getOpenAccess)
@ -107,10 +104,10 @@ object SparkPrepareHostedByInfoToApply {
//STEP1: read the hostedbymap and transform it in EntityInfo
val hostedByInfo:Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)
val hostedByInfo: Dataset[EntityInfo] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath)).map(toEntityInfo)
//STEP2: create association (publication, issn), (publication, eissn), (publication, lissn)
val resultInfoDataset:Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication")
val resultInfoDataset: Dataset[EntityInfo] = prepareResultInfo(spark, graphPath + "/publication")
//STEP3: left join resultInfo with hostedByInfo on journal_id. Reduction of all the results with the same id in just
//one entry (one result could be associated to issn and eissn and so possivly matching more than once against the map)

View File

@ -1,29 +1,27 @@
package eu.dnetlib.dhp.oa.graph.hostedbymap
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.hostedbymap.model.{DOAJModel, UnibiGoldModel}
import eu.dnetlib.dhp.schema.oaf.Datasource
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.json4s.DefaultFormats
import org.slf4j.{Logger, LoggerFactory}
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
import java.io.PrintWriter
import org.apache.hadoop.io.compress.GzipCodec
object SparkProduceHostedByMap {
implicit val tupleForJoinEncoder: Encoder[(String, HostedByItemType)] = Encoders.tuple(Encoders.STRING, Encoders.product[HostedByItemType])
def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)) : HostedByItemType = {
def toHostedByItemType(input: ((HostedByInfo, HostedByInfo), HostedByInfo)): HostedByItemType = {
val openaire: HostedByInfo = input._1._1
val doaj: HostedByInfo = input._1._2
val gold: HostedByInfo = input._2
@ -44,7 +42,7 @@ object SparkProduceHostedByMap {
implicit val formats = org.json4s.DefaultFormats
val map: Map [String, HostedByItemType] = Map (input._1 -> input._2 )
val map: Map[String, HostedByItemType] = Map(input._1 -> input._2)
Serialization.write(map)
@ -52,34 +50,33 @@ object SparkProduceHostedByMap {
}
def getHostedByItemType(id:String, officialname: String, issn:String, eissn:String, issnl:String, oa:Boolean): HostedByItemType = {
if(issn != null){
if(eissn != null){
if(issnl != null){
HostedByItemType(id, officialname, issn, eissn, issnl , oa)
}else{
HostedByItemType(id, officialname, issn, eissn, "" , oa)
def getHostedByItemType(id: String, officialname: String, issn: String, eissn: String, issnl: String, oa: Boolean): HostedByItemType = {
if (issn != null) {
if (eissn != null) {
if (issnl != null) {
HostedByItemType(id, officialname, issn, eissn, issnl, oa)
} else {
HostedByItemType(id, officialname, issn, eissn, "", oa)
}
}else{
if(issnl != null){
HostedByItemType(id, officialname, issn, "", issnl , oa)
}else{
HostedByItemType(id, officialname, issn, "", "" , oa)
} else {
if (issnl != null) {
HostedByItemType(id, officialname, issn, "", issnl, oa)
} else {
HostedByItemType(id, officialname, issn, "", "", oa)
}
}
}else{
if(eissn != null){
if(issnl != null){
HostedByItemType(id, officialname, "", eissn, issnl , oa)
}else{
HostedByItemType(id, officialname, "", eissn, "" , oa)
} else {
if (eissn != null) {
if (issnl != null) {
HostedByItemType(id, officialname, "", eissn, issnl, oa)
} else {
HostedByItemType(id, officialname, "", eissn, "", oa)
}
}else{
if(issnl != null){
HostedByItemType(id, officialname, "", "", issnl , oa)
}else{
HostedByItemType("", "", "", "", "" , oa)
} else {
if (issnl != null) {
HostedByItemType(id, officialname, "", "", issnl, oa)
} else {
HostedByItemType("", "", "", "", "", oa)
}
}
}
@ -90,10 +87,10 @@ object SparkProduceHostedByMap {
return getHostedByItemType(dats.getId, dats.getOfficialname.getValue, dats.getJournal.getIssnPrinted, dats.getJournal.getIssnOnline, dats.getJournal.getIssnLinking, false)
}
HostedByItemType("","","","","",false)
HostedByItemType("", "", "", "", "", false)
}
def oaHostedByDataset(spark:SparkSession, datasourcePath : String) : Dataset[HostedByItemType] = {
def oaHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
import spark.implicits._
@ -102,10 +99,10 @@ object SparkProduceHostedByMap {
implicit var encoderD = Encoders.kryo[Datasource]
val dd : Dataset[Datasource] = spark.read.textFile(datasourcePath)
val dd: Dataset[Datasource] = spark.read.textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[Datasource]))
dd.map{ddt => oaToHostedbyItemType(ddt)}.filter(hb => !(hb.id.equals("")))
dd.map { ddt => oaToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
}
@ -115,17 +112,17 @@ object SparkProduceHostedByMap {
}
def goldHostedByDataset(spark:SparkSession, datasourcePath:String) : Dataset[HostedByItemType] = {
def goldHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
import spark.implicits._
implicit val mapEncoderUnibi: Encoder[UnibiGoldModel] = Encoders.kryo[UnibiGoldModel]
val mapper = new ObjectMapper()
val dd : Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath)
val dd: Dataset[UnibiGoldModel] = spark.read.textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[UnibiGoldModel]))
dd.map{ddt => goldToHostedbyItemType(ddt)}.filter(hb => !(hb.id.equals("")))
dd.map { ddt => goldToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
}
@ -134,41 +131,40 @@ object SparkProduceHostedByMap {
return getHostedByItemType(Constants.DOAJ, doaj.getJournalTitle, doaj.getIssn, doaj.getEissn, "", true)
}
def doajHostedByDataset(spark:SparkSession, datasourcePath:String) : Dataset[HostedByItemType] = {
def doajHostedByDataset(spark: SparkSession, datasourcePath: String): Dataset[HostedByItemType] = {
import spark.implicits._
implicit val mapEncoderDOAJ: Encoder[DOAJModel] = Encoders.kryo[DOAJModel]
val mapper = new ObjectMapper()
val dd : Dataset[DOAJModel] = spark.read.textFile(datasourcePath)
val dd: Dataset[DOAJModel] = spark.read.textFile(datasourcePath)
.map(r => mapper.readValue(r, classOf[DOAJModel]))
dd.map{ddt => doajToHostedbyItemType(ddt)}.filter(hb => !(hb.id.equals("")))
dd.map { ddt => doajToHostedbyItemType(ddt) }.filter(hb => !(hb.id.equals("")))
}
def toList(input: HostedByItemType): List[(String, HostedByItemType)] = {
var lst : List[(String, HostedByItemType)] = List()
if(!input.issn.equals("")){
var lst: List[(String, HostedByItemType)] = List()
if (!input.issn.equals("")) {
lst = (input.issn, input) :: lst
}
if(!input.eissn.equals("")){
if (!input.eissn.equals("")) {
lst = (input.eissn, input) :: lst
}
if(!input.lissn.equals("")){
if (!input.lissn.equals("")) {
lst = (input.lissn, input) :: lst
}
lst
}
def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode : String):Unit = {
def writeToHDFS(input: Array[String], outputPath: String, hdfsNameNode: String): Unit = {
val conf = new Configuration()
conf.set("fs.defaultFS", hdfsNameNode)
val fs= FileSystem.get(conf)
val fs = FileSystem.get(conf)
val output = fs.create(new Path(outputPath))
val writer = new PrintWriter(output)
try {
@ -182,7 +178,6 @@ object SparkProduceHostedByMap {
}
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
@ -213,7 +208,7 @@ object SparkProduceHostedByMap {
.union(doajHostedByDataset(spark, workingDirPath + "/doaj.json"))
.flatMap(hbi => toList(hbi))).filter(hbi => hbi._2.id.startsWith("10|"))
.map(hbi => toHostedByMap(hbi))(Encoders.STRING)
.rdd.saveAsTextFile(outputPath , classOf[GzipCodec])
.rdd.saveAsTextFile(outputPath, classOf[GzipCodec])
}

View File

@ -4,17 +4,11 @@ import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.HdfsSupport
import eu.dnetlib.dhp.schema.common.ModelSupport
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.io.IOUtils
import org.apache.commons.lang3.StringUtils
import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.HttpClients
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory
import scala.collection.JavaConverters._
import scala.io.Source
@ -59,7 +53,7 @@ object CopyHdfsOafSparkApplication {
if (validPaths.nonEmpty) {
val oaf = spark.read.load(validPaths: _*).as[Oaf]
val mapper = new ObjectMapper()
val l =ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList
val l = ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList
l.foreach(
e =>
oaf.filter(o => o.getClass.getSimpleName.equalsIgnoreCase(e))

View File

@ -2,9 +2,8 @@ package eu.dnetlib.dhp.oa.graph.resolution
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.HdfsSupport
import eu.dnetlib.dhp.schema.common.EntityType
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
@ -14,7 +13,7 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkResolveEntities {
val mapper = new ObjectMapper()
val entities = List(EntityType.dataset,EntityType.publication, EntityType.software, EntityType.otherresearchproduct)
val entities = List(EntityType.dataset, EntityType.publication, EntityType.software, EntityType.otherresearchproduct)
def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass)
@ -36,25 +35,19 @@ object SparkResolveEntities {
val unresolvedPath = parser.get("unresolvedPath")
log.info(s"unresolvedPath -> $unresolvedPath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
fs.mkdirs(new Path(workingPath))
resolveEntities(spark, workingPath, unresolvedPath)
generateResolvedEntities(spark, workingPath, graphBasePath)
// TO BE conservative we keep the original entities in the working dir
// and save the resolved entities on the graphBasePath
//In future these lines of code should be removed
entities.foreach {
e =>
fs.rename(new Path(s"$graphBasePath/$e"), new Path(s"$workingPath/${e}_old"))
fs.rename(new Path(s"$workingPath/resolvedGraph/$e"), new Path(s"$graphBasePath/$e"))
generateResolvedEntities(spark, workingPath, graphBasePath, targetPath)
}
}
def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = {
def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = {
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
import spark.implicits._
@ -71,37 +64,42 @@ def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: St
}
def deserializeObject(input:String, entity:EntityType ) :Result = {
def deserializeObject(input: String, entity: EntityType): Result = {
entity match {
case EntityType.publication => mapper.readValue(input, classOf[Publication])
case EntityType.dataset => mapper.readValue(input, classOf[OafDataset])
case EntityType.software=> mapper.readValue(input, classOf[Software])
case EntityType.otherresearchproduct=> mapper.readValue(input, classOf[OtherResearchProduct])
case EntityType.software => mapper.readValue(input, classOf[Software])
case EntityType.otherresearchproduct => mapper.readValue(input, classOf[OtherResearchProduct])
}
}
def generateResolvedEntities(spark:SparkSession, workingPath: String, graphBasePath:String) = {
def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String, targetPath: String) = {
implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
import spark.implicits._
val re:Dataset[Result] = spark.read.load(s"$workingPath/resolvedEntities").as[Result]
val re: Dataset[(String, Result)] = spark.read.load(s"$workingPath/resolvedEntities").as[Result].map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
entities.foreach {
e =>
e => {
val currentEntityDataset: Dataset[(String, Result)] = spark.read.text(s"$graphBasePath/$e").as[String].map(s => deserializeObject(s, e)).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
currentEntityDataset.joinWith(re, currentEntityDataset("_1").equalTo(re("_1")), "left").map(k => {
val a = k._1
val b = k._2
if (b == null)
a._2
else {
a._2.mergeFrom(b._2)
a._2
}
}).map(r => mapper.writeValueAsString(r))(Encoders.STRING)
.write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$targetPath/$e")
}
spark.read.text(s"$graphBasePath/$e").as[String]
.map(s => deserializeObject(s, e))
.union(re)
.groupByKey(_.getId)
.reduceGroups {
(x, y) =>
x.mergeFrom(y)
x
}.map(_._2)
.filter(r => r.getClass.getSimpleName.toLowerCase != "result")
.map(r => mapper.writeValueAsString(r))(Encoders.STRING)
.write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingPath/resolvedGraph/$e")
}
}
}

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.oa.graph.resolution
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.HdfsSupport
import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.io.IOUtils
import org.apache.hadoop.fs.{FileSystem, Path}
@ -35,6 +35,9 @@ object SparkResolveRelation {
val workingPath = parser.get("workingPath")
log.info(s"workingPath -> $workingPath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
import spark.implicits._
@ -80,20 +83,13 @@ object SparkResolveRelation {
.mode(SaveMode.Overwrite)
.save(s"$workingPath/relation_resolved")
// TO BE conservative we keep the original relation in the working dir
// and save the relation resolved on the graphBasePath
//In future this two line of code should be removed
fs.rename(new Path(s"$graphBasePath/relation"), new Path(s"$workingPath/relation"))
spark.read.load(s"$workingPath/relation_resolved").as[Relation]
.filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved"))
.map(r => mapper.writeValueAsString(r))
.write
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.text(s"$graphBasePath/relation")
.text(s"$targetPath/relation")
}
def extractInstanceCF(input: String): List[(String, String)] = {

View File

@ -18,7 +18,6 @@ object SparkDataciteToOAF {
.config(conf)
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
import spark.implicits._
val sc = spark.sparkContext

View File

@ -2,7 +2,7 @@ package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.oaf.Result
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf
@ -29,13 +29,13 @@ object SparkConvertDatasetToJsonRDD {
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
val resultObject = List("publication","dataset","software", "otherResearchProduct")
val resultObject = List("publication", "dataset", "software", "otherResearchProduct")
val mapper = new ObjectMapper()
implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
resultObject.foreach{item =>
spark.read.load(s"$sourcePath/$item").as[Result].map(r=> mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
resultObject.foreach { item =>
spark.read.load(s"$sourcePath/$item").as[Result].map(r => mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
}
}

View File

@ -5,10 +5,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import org.apache.hadoop.io.compress._
object SparkConvertObjectToJson {
@ -32,8 +32,8 @@ object SparkConvertObjectToJson {
log.info(s"objectType -> $objectType")
implicit val scholixEncoder :Encoder[Scholix]= Encoders.kryo[Scholix]
implicit val summaryEncoder :Encoder[ScholixSummary]= Encoders.kryo[ScholixSummary]
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
val mapper = new ObjectMapper

View File

@ -2,11 +2,12 @@ package eu.dnetlib.dhp.sx.graph
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Software,Dataset => OafDataset}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkConvertRDDtoDataset {
def main(args: Array[String]): Unit = {
@ -39,31 +40,31 @@ object SparkConvertRDDtoDataset {
log.info("Converting dataset")
val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset]))
val rddDataset = spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset]))
spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset")
log.info("Converting publication")
val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication]))
val rddPublication = spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication]))
spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication")
log.info("Converting software")
val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software]))
val rddSoftware = spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software]))
spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software")
log.info("Converting otherresearchproduct")
val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct]))
val rddOtherResearchProduct = spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct]))
spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct")
log.info("Converting Relation")
val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin")
val relationSemanticFilter = List("cites", "iscitedby", "merges", "ismergedin")
val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation")
val rddRelation = spark.sparkContext.textFile(s"$sourcePath/relation")
.map(s => mapper.readValue(s, classOf[Relation]))
.filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")

View File

@ -1,14 +1,12 @@
package eu.dnetlib.dhp.sx.graph
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset,_}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
object SparkCreateInputGraph {
def main(args: Array[String]): Unit = {
@ -41,16 +39,13 @@ object SparkCreateInputGraph {
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
val oafDs:Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf]
val oafDs: Dataset[Oaf] = spark.read.load(s"$sourcePath/*").as[Oaf]
log.info("Extract Publication")
@ -70,12 +65,12 @@ object SparkCreateInputGraph {
resultObject.foreach { r =>
log.info(s"Make ${r._1} unique")
makeDatasetUnique(s"$targetPath/extracted/${r._1}",s"$targetPath/preprocess/${r._1}",spark, r._2)
makeDatasetUnique(s"$targetPath/extracted/${r._1}", s"$targetPath/preprocess/${r._1}", spark, r._2)
}
}
def extractEntities[T <: Oaf ](oafDs:Dataset[Oaf], targetPath:String, clazz:Class[T], log:Logger) :Unit = {
def extractEntities[T <: Oaf](oafDs: Dataset[Oaf], targetPath: String, clazz: Class[T], log: Logger): Unit = {
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
log.info(s"Extract ${clazz.getSimpleName}")
@ -83,14 +78,14 @@ object SparkCreateInputGraph {
}
def makeDatasetUnique[T <: Result ](sourcePath:String, targetPath:String, spark:SparkSession, clazz:Class[T]) :Unit = {
def makeDatasetUnique[T <: Result](sourcePath: String, targetPath: String, spark: SparkSession, clazz: Class[T]): Unit = {
import spark.implicits._
implicit val resEncoder: Encoder[T] = Encoders.kryo(clazz)
val ds:Dataset[T] = spark.read.load(sourcePath).as[T]
val ds: Dataset[T] = spark.read.load(sourcePath).as[T]
ds.groupByKey(_.getId).reduceGroups{(x,y) =>
ds.groupByKey(_.getId).reduceGroups { (x, y) =>
x.mergeFrom(y)
x
}.map(_._2).write.mode(SaveMode.Overwrite).save(targetPath)

View File

@ -9,7 +9,7 @@ import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils.RelatedEntities
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
object SparkCreateScholix {
@ -42,7 +42,7 @@ object SparkCreateScholix {
val relationDS: Dataset[(String, Relation)] = spark.read.load(relationPath).as[Relation]
.filter(r => (r.getDataInfo== null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
.map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
val summaryDS: Dataset[(String, ScholixSummary)] = spark.read.load(summaryPath).as[ScholixSummary]
@ -51,54 +51,54 @@ object SparkCreateScholix {
relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
.map { input: ((String, Relation), (String, ScholixSummary)) =>
if (input._1!= null && input._2!= null) {
if (input._1 != null && input._2 != null) {
val rel: Relation = input._1._2
val source: ScholixSummary = input._2._2
(rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
}
else null
}(Encoders.tuple(Encoders.STRING, scholixEncoder))
.filter(r => r!= null)
.filter(r => r != null)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source")
val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
scholixSource.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
.map { input: ((String, Scholix), (String, ScholixSummary)) =>
if (input._2== null) {
if (input._2 == null) {
null
} else {
val s: Scholix = input._1._2
val target: ScholixSummary = input._2._2
ScholixUtils.generateCompleteScholix(s, target)
}
}.filter(s => s!= null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse")
}.filter(s => s != null).write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_one_verse")
val scholix_o_v: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix_one_verse").as[Scholix]
scholix_o_v.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s))).as[Scholix]
.map(s=> (s.getIdentifier,s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
.groupByKey(_._1)
.agg(ScholixUtils.scholixAggregator.toColumn)
.map(s => s._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix")
val scholix_final:Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix]
val scholix_final: Dataset[Scholix] = spark.read.load(s"$targetPath/scholix").as[Scholix]
val stats:Dataset[(String,String,Long)]= scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String,String,Long)]
val stats: Dataset[(String, String, Long)] = scholix_final.map(s => (s.getSource.getDnetIdentifier, s.getTarget.getObjectType)).groupBy("_1", "_2").agg(count("_1")).as[(String, String, Long)]
stats
.map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0 ))
.map(s => RelatedEntities(s._1, if ("dataset".equalsIgnoreCase(s._2)) s._3 else 0, if ("publication".equalsIgnoreCase(s._2)) s._3 else 0))
.groupByKey(_.id)
.reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset+b.relatedDataset, a.relatedPublication+b.relatedPublication))
.reduceGroups((a, b) => RelatedEntities(a.id, a.relatedDataset + b.relatedDataset, a.relatedPublication + b.relatedPublication))
.map(_._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/related_entities")
val relatedEntitiesDS:Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication>0 || r.relatedDataset > 0)
val relatedEntitiesDS: Dataset[RelatedEntities] = spark.read.load(s"$targetPath/related_entities").as[RelatedEntities].filter(r => r.relatedPublication > 0 || r.relatedDataset > 0)
relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map{i =>
relatedEntitiesDS.joinWith(summaryDS, relatedEntitiesDS("id").equalTo(summaryDS("_1")), "inner").map { i =>
val re = i._1
val sum = i._2._2

View File

@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
object SparkCreateSummaryObject {
@ -28,15 +28,15 @@ object SparkCreateSummaryObject {
val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath")
implicit val resultEncoder:Encoder[Result] = Encoders.kryo[Result]
implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result]
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val summaryEncoder:Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
val ds:Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r=>r.getDataInfo== null || r.getDataInfo.getDeletedbyinference== false)
val ds: Dataset[Result] = spark.read.load(s"$sourcePath/*").as[Result].filter(r => r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false)
ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s!= null).write.mode(SaveMode.Overwrite).save(targetPath)
ds.repartition(6000).map(r => ScholixUtils.resultToSummary(r)).filter(s => s != null).write.mode(SaveMode.Overwrite).save(targetPath)
}

View File

@ -5,6 +5,7 @@ import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import java.util.regex.Pattern
import scala.language.postfixOps
import scala.xml.{Elem, Node, XML}

View File

@ -2,11 +2,11 @@ package eu.dnetlib.dhp.sx.graph.pangaea
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import scala.io.Source
object SparkGeneratePanagaeaDataset {
@ -28,16 +28,16 @@ object SparkGeneratePanagaeaDataset {
parser.getObjectMap.asScala.foreach(s => logger.info(s"${s._1} -> ${s._2}"))
logger.info("Converting sequential file into Dataset")
val sc:SparkContext = spark.sparkContext
val sc: SparkContext = spark.sparkContext
val workingPath:String = parser.get("workingPath")
val workingPath: String = parser.get("workingPath")
implicit val pangaeaEncoders: Encoder[PangaeaDataModel] = Encoders.kryo[PangaeaDataModel]
val inputRDD:RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
val inputRDD: RDD[PangaeaDataModel] = sc.textFile(s"$workingPath/update").map(s => PangaeaUtils.toDataset(s))
spark.createDataset(inputRDD).as[PangaeaDataModel]
.map(s => (s.identifier,s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders))
.map(s => (s.identifier, s))(Encoders.tuple(Encoders.STRING, pangaeaEncoders))
.groupByKey(_._1)(Encoders.STRING)
.agg(PangaeaUtils.getDatasetAggregator().toColumn)
.map(s => s._2)
@ -46,7 +46,4 @@ object SparkGeneratePanagaeaDataset {
}
}

View File

@ -1,6 +1,5 @@
package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.scholix._
import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
@ -10,23 +9,22 @@ import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
import scala.io.Source
import scala.language.postfixOps
object ScholixUtils {
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
val DATE_RELATION_KEY:String = "RelationDate"
case class RelationVocabulary(original:String, inverse:String){}
val DATE_RELATION_KEY: String = "RelationDate"
case class RelatedEntities(id:String, relatedDataset:Long, relatedPublication:Long){}
case class RelationVocabulary(original: String, inverse: String) {}
val relations:Map[String, RelationVocabulary] = {
val input =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relations.json")).mkString
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
val relations: Map[String, RelationVocabulary] = {
val input = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/relations.json")).mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
@ -35,12 +33,12 @@ object ScholixUtils {
}
def extractRelationDate(relation: Relation):String = {
def extractRelationDate(relation: Relation): String = {
if (relation.getProperties== null || !relation.getProperties.isEmpty)
if (relation.getProperties == null || !relation.getProperties.isEmpty)
null
else {
val date =relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue)
val date = relation.getProperties.asScala.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)).map(p => p.getValue)
if (date.isDefined)
date.get
else
@ -48,9 +46,9 @@ object ScholixUtils {
}
}
def extractRelationDate(summary: ScholixSummary):String = {
def extractRelationDate(summary: ScholixSummary): String = {
if(summary.getDate== null || summary.getDate.isEmpty)
if (summary.getDate == null || summary.getDate.isEmpty)
null
else {
summary.getDate.get(0)
@ -59,15 +57,14 @@ object ScholixUtils {
}
def inverseRelationShip(rel:ScholixRelationship):ScholixRelationship = {
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
}
val statsAggregator:Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String,String, Long), RelatedEntities, RelatedEntities] with Serializable {
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
override def zero: RelatedEntities = null
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
@ -78,15 +75,14 @@ object ScholixUtils {
if (b == null)
RelatedEntities(a._1, relatedDataset, relatedPublication)
else
RelatedEntities(a._1,b.relatedDataset+ relatedDataset, b.relatedPublication+ relatedPublication )
RelatedEntities(a._1, b.relatedDataset + relatedDataset, b.relatedPublication + relatedPublication)
}
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
if (b1!= null && b2!= null)
RelatedEntities(b1.id, b1.relatedDataset+ b2.relatedDataset, b1.relatedPublication+ b2.relatedPublication)
if (b1 != null && b2 != null)
RelatedEntities(b1.id, b1.relatedDataset + b2.relatedDataset, b1.relatedPublication + b2.relatedPublication)
else
if (b1!= null)
else if (b1 != null)
b1
else
b2
@ -104,8 +100,8 @@ object ScholixUtils {
override def zero: Scholix = null
def scholix_complete(s:Scholix):Boolean ={
if (s== null || s.getIdentifier==null) {
def scholix_complete(s: Scholix): Boolean = {
if (s == null || s.getIdentifier == null) {
false
} else if (s.getSource == null || s.getTarget == null) {
false
@ -132,7 +128,7 @@ object ScholixUtils {
}
def createInverseScholixRelation(scholix: Scholix):Scholix = {
def createInverseScholixRelation(scholix: Scholix): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
s.setPublisher(scholix.getPublisher)
@ -144,34 +140,33 @@ object ScholixUtils {
s
}
def extractCollectedFrom(summary:ScholixSummary): List[ScholixEntityId] = {
if (summary.getDatasources!= null && !summary.getDatasources.isEmpty) {
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map{
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map {
d => new ScholixEntityId(d.getDatasourceName, List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava)
}(collection.breakOut)
l
} else List()
}
def extractCollectedFrom(relation: Relation) : List[ScholixEntityId] = {
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map {
c =>
new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA,null)).asJava)
new ScholixEntityId(c.getValue, List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava)
}(collection breakOut)
l
} else List()
}
def generateCompleteScholix(scholix: Scholix, target:ScholixSummary): Scholix = {
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
s.setPublisher(scholix.getPublisher)
@ -192,29 +187,28 @@ object ScholixUtils {
r.setObjectType(summaryObject.getTypology.toString)
r.setObjectSubType(summaryObject.getSubType)
if (summaryObject.getTitle!= null && !summaryObject.getTitle.isEmpty)
if (summaryObject.getTitle != null && !summaryObject.getTitle.isEmpty)
r.setTitle(summaryObject.getTitle.get(0))
if (summaryObject.getAuthor!= null && !summaryObject.getAuthor.isEmpty){
val l:List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a,null)).toList
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
val l: List[ScholixEntityId] = summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
if (l.nonEmpty)
r.setCreator(l.asJava)
}
if (summaryObject.getDate!= null && !summaryObject.getDate.isEmpty)
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
r.setPublicationDate(summaryObject.getDate.get(0))
if (summaryObject.getPublisher!= null && !summaryObject.getPublisher.isEmpty)
{
val plist:List[ScholixEntityId] =summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
val plist: List[ScholixEntityId] = summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
if (plist.nonEmpty)
r.setPublisher(plist.asJava)
}
if (summaryObject.getDatasources!= null && !summaryObject.getDatasources.isEmpty) {
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
val l:List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom(
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala.map(c => new ScholixCollectedFrom(
new ScholixEntityId(c.getDatasourceName, List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava)
, "collected", "complete"
@ -228,12 +222,9 @@ object ScholixUtils {
}
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
def scholixFromSource(relation:Relation, source:ScholixSummary):Scholix = {
if (relation== null || source== null)
if (relation == null || source == null)
return null
val s = new Scholix
@ -253,9 +244,9 @@ object ScholixUtils {
s.setPublicationDate(d)
if (source.getPublisher!= null && !source.getPublisher.isEmpty) {
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
val l: List[ScholixEntityId] = source.getPublisher.asScala
.map{
.map {
p =>
new ScholixEntityId(p, null)
}(collection.breakOut)
@ -265,7 +256,7 @@ object ScholixUtils {
}
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation== null)
if (semanticRelation == null)
return null
s.setRelationship(new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse))
s.setSource(generateScholixResourceFromSummary(source))
@ -274,8 +265,8 @@ object ScholixUtils {
}
def findURLForPID(pidValue:List[StructuredProperty], urls:List[String]):List[(StructuredProperty, String)] = {
pidValue.map{
def findURLForPID(pidValue: List[StructuredProperty], urls: List[String]): List[(StructuredProperty, String)] = {
pidValue.map {
p =>
val pv = p.getValue
@ -285,67 +276,67 @@ object ScholixUtils {
}
def extractTypedIdentifierFromInstance(r:Result):List[ScholixIdentifier] = {
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
if (r.getInstance() == null || r.getInstance().isEmpty)
return List()
r.getInstance().asScala.filter(i => i.getUrl!= null && !i.getUrl.isEmpty)
.filter(i => i.getPid!= null && i.getUrl != null)
r.getInstance().asScala.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
.filter(i => i.getPid != null && i.getUrl != null)
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList
}
def resultToSummary(r:Result):ScholixSummary = {
def resultToSummary(r: Result): ScholixSummary = {
val s = new ScholixSummary
s.setId(r.getId)
if (r.getPid == null || r.getPid.isEmpty)
return null
val persistentIdentifiers:List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r)
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r)
if (persistentIdentifiers.isEmpty)
return null
s.setLocalIdentifier(persistentIdentifiers.asJava)
if (r.isInstanceOf[Publication] )
if (r.isInstanceOf[Publication])
s.setTypology(Typology.publication)
else
s.setTypology(Typology.dataset)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
if (r.getTitle!= null && r.getTitle.asScala.nonEmpty) {
val titles:List[String] =r.getTitle.asScala.map(t => t.getValue)(collection breakOut)
if (r.getTitle != null && r.getTitle.asScala.nonEmpty) {
val titles: List[String] = r.getTitle.asScala.map(t => t.getValue)(collection breakOut)
if (titles.nonEmpty)
s.setTitle(titles.asJava)
else
return null
}
if(r.getAuthor!= null && !r.getAuthor.isEmpty) {
val authors:List[String] = r.getAuthor.asScala.map(a=> a.getFullname)(collection breakOut)
if (r.getAuthor != null && !r.getAuthor.isEmpty) {
val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname)(collection breakOut)
if (authors nonEmpty)
s.setAuthor(authors.asJava)
}
if (r.getInstance() != null) {
val dt:List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue)(collection.breakOut)
val dt: List[String] = r.getInstance().asScala.filter(i => i.getDateofacceptance != null).map(i => i.getDateofacceptance.getValue)(collection.breakOut)
if (dt.nonEmpty)
s.setDate(dt.distinct.asJava)
}
if (r.getDescription!= null && !r.getDescription.isEmpty) {
val d = r.getDescription.asScala.find(f => f!= null && f.getValue!=null)
if (r.getDescription != null && !r.getDescription.isEmpty) {
val d = r.getDescription.asScala.find(f => f != null && f.getValue != null)
if (d.isDefined)
s.setDescription(d.get.getValue)
}
if (r.getSubject!= null && !r.getSubject.isEmpty) {
val subjects:List[SchemeValue] =r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))(collection breakOut)
if (r.getSubject != null && !r.getSubject.isEmpty) {
val subjects: List[SchemeValue] = r.getSubject.asScala.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))(collection breakOut)
if (subjects.nonEmpty)
s.setSubject(subjects.asJava)
}
if (r.getPublisher!= null)
if (r.getPublisher != null)
s.setPublisher(List(r.getPublisher.getValue).asJava)
if (r.getCollectedfrom!= null && !r.getCollectedfrom.isEmpty) {
val cf:List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))(collection breakOut)
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))(collection breakOut)
if (cf.nonEmpty)
s.setDatasources(cf.distinct.asJava)
}

View File

@ -12,6 +12,8 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.MappableBlock;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
@ -66,9 +68,59 @@ public class GraphCleaningFunctionsTest {
Relation r_out = OafCleaner.apply(r_in, mapping);
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r_out.getRelClass()));
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_SUBRELTYPE).contains(r_out.getSubRelType()));
assertEquals("iis", r_out.getDataInfo().getProvenanceaction().getClassid());
assertEquals("Inferred by OpenAIRE", r_out.getDataInfo().getProvenanceaction().getClassname());
}
}
@Test
void testFilter_false() throws Exception {
assertNotNull(vocabularies);
assertNotNull(mapping);
String json = IOUtils
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
assertEquals(false, GraphCleaningFunctions.filter(p_in));
}
@Test
void testFilter_true() throws Exception {
assertNotNull(vocabularies);
assertNotNull(mapping);
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
assertEquals(true, GraphCleaningFunctions.filter(p_in));
}
@Test
void testFilter_missing_invisible() throws Exception {
assertNotNull(vocabularies);
assertNotNull(mapping);
String json = IOUtils
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
assertEquals(true, GraphCleaningFunctions.filter(p_in));
}
@Test
void testCleaning() throws Exception {
@ -99,6 +151,12 @@ public class GraphCleaningFunctionsTest {
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
assertEquals("0027", p_out.getInstance().get(1).getInstancetype().getClassid());
assertEquals("Model", p_out.getInstance().get(1).getInstancetype().getClassname());
assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassid());
assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassname());
assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid());
assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname());
@ -112,7 +170,7 @@ public class GraphCleaningFunctionsTest {
List<Instance> poi = p_out.getInstance();
assertNotNull(poi);
assertEquals(1, poi.size());
assertEquals(3, poi.size());
final Instance poii = poi.get(0);
assertNotNull(poii);
@ -140,7 +198,7 @@ public class GraphCleaningFunctionsTest {
assertEquals(5, p_out.getTitle().size());
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out);
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
assertEquals(3, p_cleaned.getTitle().size());
@ -159,9 +217,12 @@ public class GraphCleaningFunctionsTest {
assertEquals("1970-10-07", p_cleaned.getDateofacceptance().getValue());
assertEquals("0038", p_cleaned.getInstance().get(2).getInstancetype().getClassid());
assertEquals("Other literature type", p_cleaned.getInstance().get(2).getInstancetype().getClassname());
final List<Instance> pci = p_cleaned.getInstance();
assertNotNull(pci);
assertEquals(1, pci.size());
assertEquals(3, pci.size());
final Instance pcii = pci.get(0);
assertNotNull(pcii);
@ -222,4 +283,27 @@ public class GraphCleaningFunctionsTest {
.readLines(
GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
}
@Test
public void testCleanDoiBoost() throws IOException {
String json = IOUtils
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
}
@Test
public void testCleanDoiBoost2() throws IOException {
String json = IOUtils
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub2.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
}
}

View File

@ -171,24 +171,6 @@ public class DumpJobTest {
GraphResult gr = verificationDataset.first();
Assertions.assertEquals(2, gr.getMeasures().size());
Assertions
.assertTrue(
gr
.getMeasures()
.stream()
.anyMatch(
m -> m.getKey().equals("influence")
&& m.getValue().equals("1.62759106106e-08")));
Assertions
.assertTrue(
gr
.getMeasures()
.stream()
.anyMatch(
m -> m.getKey().equals("popularity")
&& m.getValue().equals("0.22519296")));
Assertions.assertEquals(6, gr.getAuthor().size());
Assertions
.assertTrue(

View File

@ -708,7 +708,7 @@ class MappersTest {
assertEquals(1, p.getTitle().size());
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
final Publication p_cleaned = cleanup(fixVocabularyNames(p));
final Publication p_cleaned = cleanup(fixVocabularyNames(p), vocs);
assertNotNull(p_cleaned.getTitle());
assertFalse(p_cleaned.getTitle().isEmpty());

Some files were not shown because too many files have changed in this diff Show More