forked from D-Net/dnet-hadoop
Compare commits
42 Commits
master ... contextPro
Author | SHA1 | Date |
---|---|---|
Miriam Baglioni | 6b79d1cf2a | |
Miriam Baglioni | 87018ac895 | |
Miriam Baglioni | c2ef9e3856 | |
Miriam Baglioni | 59c36eb185 | |
Miriam Baglioni | 2d5f9e8e1c | |
Miriam Baglioni | 066e1dc772 | |
Miriam Baglioni | 00dfaff973 | |
Miriam Baglioni | 464ac6301c | |
Miriam Baglioni | c07f820c21 | |
Miriam Baglioni | 2740b95f99 | |
Miriam Baglioni | ca7e10b3c0 | |
Miriam Baglioni | 2f6673e678 | |
Claudio Atzori | 6b8c357381 | |
Claudio Atzori | c0d2b62e46 | |
Claudio Atzori | a3948c1f6e | |
Claudio Atzori | fddbc8364e | |
Alessia Bardi | 6208b04f1d | |
Sandro La Bruzzo | 9ca438d9b1 | |
Sandro La Bruzzo | 42ff7a5665 | |
Sandro La Bruzzo | ebe6aa6d38 | |
Miriam Baglioni | 0eda93b3eb | |
Miriam Baglioni | 72771a1254 | |
Miriam Baglioni | 6cdc4d3bf3 | |
Miriam Baglioni | a106353cee | |
Miriam Baglioni | 5d8257b288 | |
Claudio Atzori | a4cfabdbc6 | |
Claudio Atzori | 338327171d | |
Claudio Atzori | 6cbda49112 | |
Claudio Atzori | ea9b00ce56 | |
Claudio Atzori | 2e70aa43f0 | |
Antonis Lempesis | 168edcbde3 | |
Antonis Lempesis | 625d993cd9 | |
Antonis Lempesis | 25d0512fbd | |
Miriam Baglioni | 9d617a0a58 | |
Miriam Baglioni | d69c19e3fe | |
Miriam Baglioni | efd34c63ae | |
Miriam Baglioni | 3214101a75 | |
Miriam Baglioni | fe7a7f2415 | |
Miriam Baglioni | a9fbd5b22d | |
Miriam Baglioni | 4c12e9664e | |
Miriam Baglioni | fb9e4a2769 | |
Miriam Baglioni | 7572069f98 | |
@@ -115,6 +115,8 @@ public class AuthorMerger {
	}

	public static String pidToComparableString(StructuredProperty pid) {
		if (pid == null)
			return "";
		return (pid.getQualifier() != null
			? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
			: "")
@@ -7,6 +7,37 @@
		<version>1.2.4-SNAPSHOT</version>
	</parent>
	<artifactId>dhp-aggregation</artifactId>

	<build>
		<plugins>
			<plugin>
				<groupId>net.alchim31.maven</groupId>
				<artifactId>scala-maven-plugin</artifactId>
				<version>4.0.1</version>
				<executions>
					<execution>
						<id>scala-compile-first</id>
						<phase>initialize</phase>
						<goals>
							<goal>add-source</goal>
							<goal>compile</goal>
						</goals>
					</execution>
					<execution>
						<id>scala-test-compile</id>
						<phase>process-test-resources</phase>
						<goals>
							<goal>testCompile</goal>
						</goals>
					</execution>
				</executions>
				<configuration>
					<scalaVersion>${scala.version}</scalaVersion>
				</configuration>
			</plugin>
		</plugins>
	</build>

	<dependencies>

@@ -24,12 +55,6 @@
			<groupId>eu.dnetlib.dhp</groupId>
			<artifactId>dhp-common</artifactId>
			<version>${project.version}</version>
			<exclusions>
				<exclusion>
					<groupId>com.sun.xml.bind</groupId>
					<artifactId>jaxb-core</artifactId>
				</exclusion>
			</exclusions>
		</dependency>

		<dependency>

@@ -37,6 +62,13 @@
			<artifactId>dhp-schemas</artifactId>
		</dependency>

		<dependency>
			<groupId>eu.dnetlib.dhp</groupId>
			<artifactId>dhp-graph-mapper</artifactId>
			<version>${project.version}</version>
		</dependency>

		<dependency>
			<groupId>net.sf.saxon</groupId>

@@ -76,7 +108,10 @@
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-compress</artifactId>
		</dependency>

		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-text</artifactId>
		</dependency>

	</dependencies>
@@ -0,0 +1,544 @@
package eu.dnetlib.dhp.actionmanager.datacite

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse

import java.nio.charset.CodingErrorAction
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import java.util.regex.Pattern
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}

case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}

case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}

case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}

case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}

case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}

case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}

case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}

case class DateType(date: Option[String], dateType: Option[String]) {}

case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}

object DataciteToOAFTransformation {

  val UNKNOWN_REPOSITORY_ORIGINALID = "openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18"
  val DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254"

  val DNET_DATACITE_DATE = "dnet:dataCite_date"
  val DNET_DATACITE_TITLE = "dnet:dataCite_title"
  val SYSIMPORT_ACTIONSET = "sysimport:actionset"
  val DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"

  val PROVENANCE_ACTION_SET_QUALIFIER: Qualifier = OafMapperUtils.qualifier(SYSIMPORT_ACTIONSET, SYSIMPORT_ACTIONSET, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS)

  val MAIN_TITLE_QUALIFIER: Qualifier = OafMapperUtils.qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE)

  implicit val codec: Codec = Codec("UTF-8")
  codec.onMalformedInput(CodingErrorAction.REPLACE)
  codec.onUnmappableCharacter(CodingErrorAction.REPLACE)

  val DOI_CLASS = "doi"
  val SUBJ_CLASS = "keywords"

  val j_filter: List[String] = {
    val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
    s.lines.toList
  }

  val mapper = new ObjectMapper()
  val unknown_repository: HostedByMapType = HostedByMapType(UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))

  val dataInfo: DataInfo = generateDataInfo("0.9")
  val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(DATACITE_ID, "Datacite")

  val hostedByMap: Map[String, HostedByMapType] = {
    val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(s)
    json.extract[Map[String, HostedByMapType]]
  }

  val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
  val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)

  val funder_regex: List[(Pattern, String)] = List(
    (Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
    (Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
  )

  val Date_regex: List[Pattern] = List(
    //Y-M-D
    Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
    //M-D-Y
    Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
    //D-M-Y
    Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
    //Y
    Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
  )

  def filter_json(json: String): Boolean = {
    j_filter.exists(f => json.contains(f))
  }

  def toActionSet(item: Oaf): (String, String) = {
    val mapper = new ObjectMapper()

    item match {
      case dataset: OafDataset =>
        val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
        a.setClazz(classOf[OafDataset])
        a.setPayload(dataset)
        (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case publication: Publication =>
        val a: AtomicAction[Publication] = new AtomicAction[Publication]
        a.setClazz(classOf[Publication])
        a.setPayload(publication)
        (publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case software: Software =>
        val a: AtomicAction[Software] = new AtomicAction[Software]
        a.setClazz(classOf[Software])
        a.setPayload(software)
        (software.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case orp: OtherResearchProduct =>
        val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
        a.setClazz(classOf[OtherResearchProduct])
        a.setPayload(orp)
        (orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case relation: Relation =>
        val a: AtomicAction[Relation] = new AtomicAction[Relation]
        a.setClazz(classOf[Relation])
        a.setPayload(relation)
        (relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
      case _ =>
        null
    }
  }
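
  // Illustrative note (editor's comment, not part of the original diff): toActionSet wraps
  // each supported Oaf entity in an AtomicAction and returns the pair
  // (payload class canonical name, JSON-serialized action); unsupported types yield null
  // and are filtered out by the caller (see ExportActionSetJobNode further below).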

  def embargo_end(embargo_end_date: String): Boolean = {
    val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
    val td = LocalDate.now()
    td.isAfter(dt)
  }

  def extract_date(input: String): Option[String] = {
    val d = Date_regex.map(pattern => {
      val matcher = pattern.matcher(input)
      if (matcher.find())
        matcher.group(0)
      else
        null
    }
    ).find(s => s != null)

    if (d.isDefined) {
      val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
      try {
        return Some(LocalDate.parse(a_date, df_en).toString)
      } catch {
        case _: Throwable => try {
          return Some(LocalDate.parse(a_date, df_it).toString)
        } catch {
          case _: Throwable =>
            return None
        }
      }
    }
    d
  }
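
  // Illustrative note (editor's comment, not part of the original diff): with hypothetical
  // inputs, extract_date("first published 2018-05-03") matches the Y-M-D pattern and yields
  // Some("2018-05-03"), while a year-only value such as "2019" is expanded to "01-01-2019"
  // and parsed with df_en, yielding Some("2019-01-01").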

  def fix_thai_date(input: String, format: String): String = {
    try {
      val a_date = LocalDate.parse(input, DateTimeFormatter.ofPattern(format))
      val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
      LocalDate.from(d).toString
    } catch {
      case _: Throwable => ""
    }
  }

  def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
    if (resourceType != null && resourceType.nonEmpty) {
      val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
      if (typeQualifier != null)
        return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
    }
    if (schemaOrg != null && schemaOrg.nonEmpty) {
      val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
      if (typeQualifier != null)
        return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
    }
    if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
      val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
      if (typeQualifier != null)
        return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
    }
    null
  }

  def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
    val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
    if (typeQualifiers == null)
      return null
    val i = new Instance
    i.setInstancetype(typeQualifiers._1)
    typeQualifiers._2.getClassname match {
      case "dataset" =>
        val r = new OafDataset
        r.setInstance(List(i).asJava)
        return r
      case "publication" =>
        val r = new Publication
        r.setInstance(List(i).asJava)
        return r
      case "software" =>
        val r = new Software
        r.setInstance(List(i).asJava)
        return r
      case "other" =>
        val r = new OtherResearchProduct
        r.setInstance(List(i).asJava)
        return r
    }
    null
  }

  def available_date(input: String): Boolean = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: org.json4s.JValue = parse(input)
    val l: List[String] = for {
      JObject(dates) <- json \\ "dates"
      JField("dateType", JString(dateTypes)) <- dates
    } yield dateTypes

    l.exists(p => p.equalsIgnoreCase("available"))
  }

  def OPEN_ACCESS_RIGHT = {
    val result = new Qualifier
    result.setClassid("OPEN")
    result.setClassname("OPEN")
    result.setSchemeid(ModelConstants.DNET_ACCESS_MODES)
    result.setSchemename(ModelConstants.DNET_ACCESS_MODES)
    result
  }

  /**
   * As described in ticket #6377, when the result comes from Figshare we need to
   * remove its subjects and set the access rights to OPEN.
   *
   * @param r
   */
  def fix_figshare(r: Result): Unit = {
    if (r.getInstance() != null) {
      val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
      if (hosted_by_figshare) {
        r.getInstance().asScala.foreach(i => i.setAccessright(OPEN_ACCESS_RIGHT))
        val l: List[StructuredProperty] = List()
        r.setSubject(l.asJava)
      }
    }
  }

  def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
    OafMapperUtils.structuredProperty(dt, q, null)
  }

  def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
    val r = new Relation
    r.setSource(sourceId)
    r.setTarget(targetId)
    r.setRelType(ModelConstants.RESULT_PROJECT)
    r.setRelClass(relClass)
    r.setSubRelType(ModelConstants.OUTCOME)
    r.setCollectedfrom(List(cf).asJava)
    r.setDataInfo(di)
    r
  }

  def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
    val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())

    if (match_pattern.isDefined) {
      val m = match_pattern.get._1
      val p = match_pattern.get._2
      val grantId = m.matcher(awardUri).replaceAll("$2")
      val targetId = s"$p${DHPUtils.md5(grantId)}"
      List(
        generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo),
        generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
      )
    }
    else
      List()
  }
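
  // Illustrative note (editor's comment, not part of the original diff): for a hypothetical
  // awardUri "info:eu-repo/grantAgreement/EC/H2020/123456/" the first funder_regex entry
  // matches, replaceAll("$2") keeps the six-digit grant number "123456", and the project
  // relation targets "40|corda__h2020::" + md5("123456").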

  def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup): List[Oaf] = {
    if (filter_json(input))
      return List()

    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json = parse(input)

    val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
    val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
    val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)

    val doi = (json \ "attributes" \ "doi").extract[String]
    if (doi.isEmpty)
      return List()

    //Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
    val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
    if (result == null)
      return List()

    val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
    val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
    result.setPid(List(pid).asJava)
    result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
    result.setOriginalId(List(doi).asJava)

    val d = new Date(dateOfCollection * 1000)
    val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)

    result.setDateofcollection(ISO8601FORMAT.format(d))
    result.setDateoftransformation(ISO8601FORMAT.format(ts))
    result.setDataInfo(dataInfo)

    val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())

    val authors = creators.zipWithIndex.map { case (c, idx) =>
      val a = new Author
      a.setFullname(c.name.orNull)
      a.setName(c.givenName.orNull)
      a.setSurname(c.familyName.orNull)
      if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
        a.setPid(c.nameIdentifiers.get.map(ni => {
          val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
          if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
            OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
          }
          else
            null
        }
        )
          .asJava)
      }
      if (c.affiliation.isDefined)
        a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
      a.setRank(idx + 1)
      a
    }

    val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())

    result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
      if (t.titleType.isEmpty) {
        OafMapperUtils.structuredProperty(t.title.get, MAIN_TITLE_QUALIFIER, null)
      } else {
        OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, DNET_DATACITE_TITLE, DNET_DATACITE_TITLE, null)
      }
    }).asJava)

    if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
      return List()
    result.setAuthor(authors.asJava)

    val dates = (json \\ "dates").extract[List[DateType]]
    val publication_year = (json \\ "publicationYear").extractOrElse[String](null)

    val i_date = dates
      .filter(d => d.date.isDefined && d.dateType.isDefined)
      .find(d => d.dateType.get.equalsIgnoreCase("issued"))
      .map(d => extract_date(d.date.get))
    val a_date: Option[String] = dates
      .filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
      .map(d => extract_date(d.date.get))
      .find(d => d != null && d.isDefined)
      .map(d => d.get)

    if (a_date.isDefined) {
      if (doi.startsWith("10.14457"))
        result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null))
      else
        result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
    }
    if (i_date.isDefined && i_date.get.isDefined) {
      if (doi.startsWith("10.14457")) {
        result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
        result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null))
      }
      else {
        result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
        result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
      }
    }
    else if (publication_year != null) {
      if (doi.startsWith("10.14457")) {
        result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
        result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null))
      } else {
        result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
        result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
      }
    }

    result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
      .map(d => (extract_date(d.date.get), d.dateType.get))
      .filter(d => d._1.isDefined)
      .map(d => (d._1.get, vocabularies.getTermAsQualifier(DNET_DATACITE_DATE, d._2.toLowerCase())))
      .filter(d => d._2 != null)
      .map(d => generateOAFDate(d._1, d._2)).asJava)

    val subjects = (json \\ "subjects").extract[List[SubjectType]]

    result.setSubject(subjects.filter(s => s.subject.nonEmpty)
      .map(s =>
        OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
      ).asJava)

    result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)

    val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]

    result.setDescription(
      descriptions
        .filter(d => d.description.isDefined)
        .map(d =>
          OafMapperUtils.field(d.description.get, null)
        ).filter(s => s != null).asJava)

    val publisher = (json \\ "publisher").extractOrElse[String](null)
    if (publisher != null)
      result.setPublisher(OafMapperUtils.field(publisher, null))

    val language: String = (json \\ "language").extractOrElse[String](null)
    if (language != null)
      result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))

    val instance = result.getInstance().get(0)

    val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]

    val accessRights: List[String] = for {
      JObject(rightsList) <- json \\ "rightsList"
      JField("rightsUri", JString(rightsUri)) <- rightsList
    } yield rightsUri

    val aRights: Option[Qualifier] = accessRights.map(r => {
      vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
    }).find(q => q != null).map(q => {
      val a = new Qualifier
      a.setClassid(q.getClassid)
      a.setClassname(q.getClassname)
      a.setSchemeid(q.getSchemeid)
      a.setSchemename(q.getSchemename)
      a
    })

    val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.qualifier(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)

    if (client.isDefined) {
      val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
      instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
      instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
      instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
      instance.setAccessright(access_rights_qualifier)
      val license = accessRights
        .find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
      if (license.isDefined)
        instance.setLicense(OafMapperUtils.field(license.get, null))
    }

    val awardUris: List[String] = for {
      JObject(fundingReferences) <- json \\ "fundingReferences"
      JField("awardUri", JString(awardUri)) <- fundingReferences
    } yield awardUri

    val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
    fix_figshare(result)
    if (relations != null && relations.nonEmpty) {
      List(result) ::: relations
    }
    else
      List(result)
  }

  def generateDataInfo(trust: String): DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
    di.setInvisible(false)
    di.setTrust(trust)
    di.setProvenanceaction(PROVENANCE_ACTION_SET_QUALIFIER)
    di
  }

  def generateDSId(input: String): String = {
    val b = StringUtils.substringBefore(input, "::")
    val a = StringUtils.substringAfter(input, "::")
    s"10|$b::${DHPUtils.md5(a)}"
  }
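
  // Illustrative note (editor's comment, not part of the original diff): generateDSId turns
  // a raw datasource identifier of the form "prefix::localId" into "10|prefix::" + md5(localId),
  // e.g. the unknown-repository id above becomes "10|openaire____::" followed by the md5 of
  // its local part.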
}
@@ -0,0 +1,40 @@
package eu.dnetlib.dhp.actionmanager.datacite

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

import scala.io.Source

object ExportActionSetJobNode {

  val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf
    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString)
    parser.parseArgument(args)
    val master = parser.get("master")
    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")

    val spark: SparkSession = SparkSession.builder().config(conf)
      .appName(ExportActionSetJobNode.getClass.getSimpleName)
      .master(master)
      .getOrCreate()
    implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
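
    // Illustrative note (editor's comment, not part of the original diff): the pipeline below
    // reads the Kryo-encoded Oaf dataset, converts every entity into an action-set entry and
    // writes a gzip-compressed SequenceFile<Text, Text> whose key is the payload class name
    // and whose value is the JSON-serialized AtomicAction.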

    spark.read.load(sourcePath).as[Oaf]
      .map(o => DataciteToOAFTransformation.toActionSet(o))
      .filter(o => o != null)
      .rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
  }
}
@@ -0,0 +1,43 @@
package eu.dnetlib.dhp.actionmanager.datacite

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

import scala.io.Source

object GenerateDataciteDatasetSpark {

  val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf
    val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString)
    parser.parseArgument(args)
    val master = parser.get("master")
    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")
    val isLookupUrl: String = parser.get("isLookupUrl")
    log.info("isLookupUrl: {}", isLookupUrl)

    val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
    val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
    val spark: SparkSession = SparkSession.builder().config(conf)
      .appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
      .master(master)
      .getOrCreate()

    implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]

    import spark.implicits._

    spark.read.load(sourcePath).as[DataciteType]
      .filter(d => d.isActive)
      .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies))
      .filter(d => d != null)
      .write.mode(SaveMode.Overwrite).save(targetPath)
  }
}
@@ -18,7 +18,6 @@ import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;

@@ -33,7 +32,6 @@ public class PrepareProjects {

	private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
@@ -0,0 +1,66 @@

package eu.dnetlib.dhp.ircdl_extention;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Result;

public class PrepareCrossrefSpark {

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareCrossrefSpark.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/ircdl_extention/prepare_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);

		final String inputPath = parser.get("inputPath");

		final String outputPath = parser.get("outputPath");

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				selectResult(spark, inputPath, outputPath);
			});
	}

	private static Dataset<Result> selectResult(SparkSession spark, String input_path, String output_path) {
		Dataset<Result> res = Utils
			.readPath(
				spark, input_path, Result.class)
			.filter(
				(FilterFunction<Result>) r -> !r.getId().startsWith("50|dedup") &&
					r.getCf().stream().anyMatch(cf -> cf.getValue().equals("Crossref")));

		res.write().option("compression", "gzip").mode(SaveMode.Overwrite).json(output_path);
		return res;
	}

}
@@ -0,0 +1,81 @@

package eu.dnetlib.dhp.ircdl_extention;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Result;

public class PrepareDataciteSpark {

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareDataciteSpark.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/ircdl_extention/prepare_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);

		final String inputPath = parser.get("inputPath");

		final String outputPath = parser.get("outputPath");

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				exec(spark, outputPath, inputPath);
			});
	}

	private static void exec(SparkSession spark, String output_path, String input_path) {

		Dataset<Result> datacite = Utils
			.readPath(
				spark, input_path, Result.class)
			.filter(
				(FilterFunction<Result>) r -> r.getId().startsWith("50|datacite"));

		datacite.write().option("compression", "gzip").mode(SaveMode.Overwrite).json(output_path + "allDatacite");
		getProviderResult(output_path, datacite, "Zenodo");
		getProviderResult(output_path, datacite, "Figshare");
		getProviderResult(output_path, datacite, "Dryad");

	}

	private static void getProviderResult(String output_path, Dataset<Result> datacite, String provider) {
		datacite
			.filter(
				(FilterFunction<Result>) r -> r
					.getPid()
					.stream()
					.anyMatch(p -> p.getKey().equals("doi") && p.getValue().contains(provider.toLowerCase())))
			.write()
			.option("compression", "gzip")
			.mode(SaveMode.Overwrite)
			.json(output_path + provider);
	}

}
@@ -0,0 +1,75 @@

package eu.dnetlib.dhp.ircdl_extention;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;

public class PrepareNormalizedOrcid {

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareNormalizedOrcid.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/ircdl_extention/prepare_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);

		final String inputPath = parser.get("inputPath");

		final String outputPath = parser.get("outputPath");

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				execNormalize(spark, outputPath, inputPath);
			});
	}

	private static void execNormalize(SparkSession spark, String outputPath, String inputPath) {
		Dataset<Orcid> orcid = Utils.readPath(spark, inputPath, Orcid.class);
		orcid.map((MapFunction<Orcid, Orcid>) o -> {
			o.setName(Utils.normalizeString(o.getName()));
			o.setSurname(Utils.normalizeString(o.getSurname()));
			o.setCreditname(Utils.normalizeString(o.getCreditname()));
			o
				.setOtherNames(
					o
						.getOtherNames()
						.stream()
						.map(on -> Utils.normalizeString(on))
						.collect(Collectors.toList()));
			return o;
		}, Encoders.bean(Orcid.class))
			.write()
			.option("compression", "gzip")
			.mode(SaveMode.Overwrite)
			.json(outputPath);
	}

}
@@ -0,0 +1,89 @@

package eu.dnetlib.dhp.ircdl_extention;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Author;
import eu.dnetlib.dhp.ircdl_extention.model.Result;

public class PrepareNormalizedResultSpark {

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareNormalizedResultSpark.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/ircdl_extention/prepare_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);

		final String inputPath = parser.get("inputPath");

		final String outputPath = parser.get("outputPath");

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				execNormalize(spark, outputPath, inputPath);
			});
	}

	private static void execNormalize(SparkSession spark, String outputPath, String inputPath) {
		Dataset<Author> normalized_result = Utils
			.readPath(spark, inputPath + "publicationsWithOrcid", Author.class)
			.union(Utils.readPath(spark, inputPath + "datasetWithOrcid", Author.class))
			.union(Utils.readPath(spark, inputPath + "softwareWithOrcid", Author.class))
			.union(Utils.readPath(spark, inputPath + "otherWithOrcid", Author.class))
			.map((MapFunction<Author, Author>) r -> {
				r.setName(Utils.normalizeString(r.getName()));
				r.setSurname(Utils.normalizeString(r.getSurname()));
				r.setFullname(Utils.normalizeString(r.getFullname()));
				return r;
			}, Encoders.bean(Author.class));

		normalized_result
			.write()
			.option("compression", "gzip")
			.mode(SaveMode.Overwrite)
			.json(outputPath + "ResultAuthorNormalized");

		normalized_result
			.filter((FilterFunction<Author>) r -> !r.getId().startsWith("50|dedup"))
			.write()
			.option("compression", "gzip")
			.mode(SaveMode.Overwrite)
			.json(outputPath + "collectedResultWithOrcid");

		normalized_result
			.filter((FilterFunction<Author>) r -> !r.getDeletedbyinference())
			.write()
			.option("compression", "gzip")
			.mode(SaveMode.Overwrite)
			.json(outputPath + "notDeletedByInferenceResultWithOrcid");
	}
}
@@ -0,0 +1,103 @@

package eu.dnetlib.dhp.ircdl_extention;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Objects;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import scala.Tuple2;

public class PrepareResultAllTheRestSpark {

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareResultAllTheRestSpark.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/ircdl_extention/prepare_alltherest_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);

		final String inputPath = parser.get("inputPath");

		final String outputPath = parser.get("outputPath");

		final String instRepoPath = parser.get("instRepoPath");
		final String crossrefPath = parser.get("crossrefPath");
		final String datacitePath = parser.get("datacitePath");

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath + "allTheRest");
				exec(
					spark, outputPath + "allTheRest",
					inputPath, instRepoPath,
					datacitePath, crossrefPath);
			});
	}
	/**
	 * Reads all the results from Crossref, Datacite and those associated with institutional
	 * repositories, then reads all the collected results and performs a left join between
	 * the collected results and the ones read in the previous step, keeping only the
	 * collected results that have no match in the join.
	 *
	 * @param spark
	 * @param output_path
	 * @param result_path
	 */
	private static void exec(SparkSession spark, String output_path, String result_path, String inst_repo_path,
		String datacite_path, String crossref_path) {

		Dataset<Result> result = Utils.readPath(spark, result_path, Result.class);

		Dataset<Result> inst_repo = Utils
			.readPath(spark, inst_repo_path, Result.class);

		Dataset<Result> datacite = Utils
			.readPath(spark, datacite_path, Result.class);

		Dataset<Result> crossref = Utils
			.readPath(spark, crossref_path, Result.class);

		Dataset<Result> union_dataset = inst_repo.union(datacite).union(crossref);

		result
			.joinWith(union_dataset, result.col("id").equalTo(union_dataset.col("id")), "left")
			.map((MapFunction<Tuple2<Result, Result>, Result>) t2 -> {
				if (!Optional.ofNullable(t2._2()).isPresent())
					return t2._1();
				return null;
			}, Encoders.bean(Result.class))
			.filter(Objects::nonNull)
			.write()
			.option("compression", "gzip")
			.mode(SaveMode.Overwrite)
			.json(output_path);

	}

}
@@ -0,0 +1,92 @@

package eu.dnetlib.dhp.ircdl_extention;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import eu.dnetlib.dhp.schema.oaf.Datasource;

public class PrepareResultFromInstRepo {

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareResultFromInstRepo.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/ircdl_extention/prepare_instrepo_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);

		final String inputPath = parser.get("inputPath");

		final String outputPath = parser.get("outputPath");

		final String datasourcePath = parser.get("datasourcePath");

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				selectResultFromInstRepo(spark, inputPath, outputPath, datasourcePath);
			});
	}

	private static void selectResultFromInstRepo(SparkSession spark, String inputPath, String output_path,
		String datasourcePath) {
		Dataset<Datasource> datasource = Utils.readPath(spark, datasourcePath, Datasource.class);
		Dataset<Result> res = Utils
			.readPath(
				spark, inputPath, Result.class)
			.filter(
				(FilterFunction<Result>) r -> !r.getId().startsWith("50|doiboost")
					&& !r.getId().startsWith("50|scholix")
					&& !r.getId().startsWith("50|datacite")
					&& !r.getId().startsWith("50|dedup"));

		datasource.createOrReplaceTempView("datasource");
		res.createOrReplaceTempView("result");

		spark
			.sql(
				"SELECT t.id, t.deletedbyinference, t.name, t.surname, t.cf, t.fullname, t.pid, t.oid " +
					"FROM " +
					"(Select * " +
					"from result " +
					"LATERAL VIEW explode(cf.key) c as cfromkey) as t " +
					"join " +
					"datasource d " +
					"on " +
					"d.id = t.cfromkey " +
					"and d.datasourcetype.classid = 'pubsrepository::institutional'")
			.as(Encoders.bean(Result.class))
			.write()
.option("compressio", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(output_path);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@@ -0,0 +1,126 @@

package eu.dnetlib.dhp.ircdl_extention;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Author;
import eu.dnetlib.dhp.ircdl_extention.model.KeyValue;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import eu.dnetlib.dhp.schema.common.ModelConstants;

public class PrepareResultSpark {

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareResultSpark.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/ircdl_extention/prepare_result_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		final String resultClassName = parser.get("resultClass");

		Class<? extends eu.dnetlib.dhp.schema.oaf.Result> resultClazz = (Class<? extends eu.dnetlib.dhp.schema.oaf.Result>) Class
			.forName(resultClassName);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);

		final String inputPath = parser.get("inputPath");

		final String outputPath = parser.get("outputPath");

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				mapToResult(spark, inputPath, resultClazz, outputPath);
			});
	}

	private static <R extends eu.dnetlib.dhp.schema.oaf.Result> void mapToResult(SparkSession spark,
		String input_path,
		Class<R> resultClazz, String output_path) {
		Dataset<R> publicationDataset = Utils.readPath(spark, input_path, resultClazz);
		Dataset<R> result = publicationDataset.filter((FilterFunction<R>) p -> {
			if (p.getAuthor() == null)
				return false;
			if (p.getAuthor().size() == 0)
				return false;
			return true;
		});

		result.flatMap((FlatMapFunction<R, Author>) p -> {
			List<Author> reslist = new ArrayList<>();
			p.getAuthor().forEach(a -> {
				Author r = new Author();
				r.setDeletedbyinference(p.getDataInfo().getDeletedbyinference());
				r.setId(p.getId());

				r
					.setCf(
						p
							.getCollectedfrom()
							.stream()
							.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
							.collect(Collectors.toList()));

				r.setName(a.getName());
				r.setSurname(a.getSurname());
				r.setFullname(a.getFullname());
				r
					.setPid(
						p
							.getPid()
							.stream()
							.map(
								pid -> KeyValue
									.newInstance(pid.getQualifier().getClassid(), pid.getValue()))
							.collect(Collectors.toList()));
				r
					.setApid(
						Optional
							.ofNullable(a.getPid())
							.map(
								pids -> pids
									.stream()
									.map(pd -> KeyValue.newInstance(pd.getQualifier().getClassid(), pd.getValue()))
									.collect(Collectors.toList()))
							.orElse(new ArrayList<>()));
				reslist.add(r);

			});
			return reslist.iterator();
		}, Encoders.bean(Author.class))
			.write()
			.option("compression", "gzip")
			.mode(SaveMode.Overwrite)
			.json(output_path);

	}

}
@@ -0,0 +1,78 @@

package eu.dnetlib.dhp.ircdl_extention;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Author;
import eu.dnetlib.dhp.ircdl_extention.model.KeyValue;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import eu.dnetlib.dhp.schema.common.ModelConstants;

public class SelectAuthorWithOrcidOnlySpark {

	public static void main(String[] args) throws Exception {

		String jsonConfiguration = IOUtils
			.toString(
				PrepareResultSpark.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/ircdl_extention/prepare_result_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);

		final String inputPath = parser.get("inputPath");

		final String outputPath = parser.get("outputPath");

		SparkConf conf = new SparkConf();

		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				Utils.removeOutputDir(spark, outputPath);
				selectAuthors(spark, inputPath, outputPath);
			});
	}

	private static void selectAuthors(SparkSession spark, String input_path, String output_path) {
		Dataset<Author> resultDataset = Utils.readPath(spark, input_path, Author.class);
		resultDataset.flatMap((FlatMapFunction<Author, Result>) p -> {
			List<Result> reslist = new ArrayList<>();
			p.getApid().forEach(a -> {
				if (a.getKey().equals(ModelConstants.ORCID_PENDING) || a.getKey().equals(ModelConstants.ORCID)) {
					Result r = Result.fromAuthor(p);
					r.setOid(a.getValue());
					reslist.add(r);
				}
			});
			return reslist.iterator();
		}, Encoders.bean(Result.class))
			.write()
.option("compressio", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(output_path);
|
||||
}
|
||||
}
|
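A minimal, non-Spark sketch of the selection rule implemented above; the author name, the ORCID value and the "mag" pid are illustrative placeholders. Only author pids keyed with ModelConstants.ORCID or ModelConstants.ORCID_PENDING produce a Result whose oid field is set.

import java.util.Arrays;

import eu.dnetlib.dhp.ircdl_extention.model.Author;
import eu.dnetlib.dhp.ircdl_extention.model.KeyValue;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import eu.dnetlib.dhp.schema.common.ModelConstants;

public class SelectAuthorSketch {
    public static void main(String[] args) {
        Author author = new Author();
        author.setFullname("Jane Doe"); // the setter lower-cases the value
        author.setApid(Arrays.asList(
            KeyValue.newInstance(ModelConstants.ORCID, "0000-0000-0000-0000"), // kept: ORCID key
            KeyValue.newInstance("mag", "123456789"))); // ignored: not an ORCID key
        author.getApid().forEach(kv -> {
            if (kv.getKey().equals(ModelConstants.ORCID) || kv.getKey().equals(ModelConstants.ORCID_PENDING)) {
                Result r = Result.fromAuthor(author);
                r.setOid(kv.getValue());
                System.out.println(r.getFullname() + " -> " + r.getOid());
            }
        });
    }
}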
|
@ -0,0 +1,268 @@
|
|||
|
||||
package eu.dnetlib.dhp.ircdl_extention;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.text.Normalizer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.text.similarity.CosineDistance;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.wcohen.ss.JaroWinkler;
|
||||
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;
|
||||
import eu.dnetlib.dhp.ircdl_extention.model.Result;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class Utils implements Serializable {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(eu.dnetlib.dhp.ircdl_extention.Utils.class);
|
||||
|
||||
public static String normalizeString(String input) {
|
||||
if (input == null || input.equals("void"))
|
||||
return new String();
|
||||
String tmp = Normalizer
|
||||
.normalize(input, Normalizer.Form.NFKD)
|
||||
.replaceAll("[^\\p{ASCII}]", "");
|
||||
tmp = tmp
|
||||
.replaceAll("[^\\p{Alpha}]+", " ")
|
||||
.replaceAll("\\s+", " ")
|
||||
.trim();
|
||||
return tmp;
|
||||
|
||||
}
|
||||
|
||||
public static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
SparkSession spark, String inputPath, Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
private static List<String> getList(List<String> input) {
|
||||
return input
|
||||
.stream()
|
||||
.map(st -> st.trim())
|
||||
.filter(st -> st.length() > 0)
|
||||
.sorted()
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
private static List<String> getListInitials(List<String> input) {
|
||||
List<String> ret = new ArrayList<>();
|
||||
List<Character> tmp = input
|
||||
.stream()
|
||||
.map(st -> st.trim())
|
||||
.filter(st -> st.length() > 0)
|
||||
.map(st -> st.charAt(0))
|
||||
.sorted()
|
||||
.collect(Collectors.toList());
|
||||
if (tmp.size() == 1)
|
||||
ret.add(String.valueOf(tmp.get(0)));
|
||||
for (int i = 0; i < tmp.size(); i++) {
|
||||
for (int j = i + 1; j < tmp.size(); j++) {
|
||||
ret.add(String.valueOf(tmp.get(i)) + String.valueOf(tmp.get(j)));
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
// selection of the pairs of first characters for each word composing the name
|
||||
// if there is a match, the name is considered correct
|
||||
// TODO add a check that the list lengths are not too unbalanced: if one list is long
|
||||
public static boolean conservativeFilterFunction(Tuple2<Result, Orcid> input) {
|
||||
|
||||
List<String> res = getListInitials(Arrays.asList(input._1().getFullname().split(" ")));
|
||||
Orcid or = input._2();
|
||||
List<String> tmp = new ArrayList<>();
|
||||
Collections.addAll(tmp, or.getName().split(" "));
|
||||
Collections.addAll(tmp, or.getSurname().split(" "));
|
||||
return checkContains(
|
||||
res, getListInitials(tmp), false)
|
||||
||
|
||||
checkContains(
|
||||
res, getListInitials(Arrays.asList(or.getCreditname().split(" "))), false)
|
||||
||
|
||||
or
|
||||
.getOtherNames()
|
||||
.stream()
|
||||
.anyMatch(
|
||||
on -> checkContains(
|
||||
res, getListInitials(Arrays.asList(on.split(" "))), false));
|
||||
|
||||
}
|
||||
|
||||
public static boolean filterFunction(Tuple2<Result, Orcid> input) throws JsonProcessingException {
|
||||
|
||||
try {
|
||||
List<String> res = getList(Arrays.asList(input._1().getFullname().split(" ")));
|
||||
Orcid or = input._2();
|
||||
List<String> tmp = new ArrayList<>();
|
||||
Collections.addAll(tmp, or.getName().split(" "));
|
||||
Collections.addAll(tmp, or.getSurname().split(" "));
|
||||
return checkContains(
|
||||
res, getList(tmp)
|
||||
.stream()
|
||||
.sorted()
|
||||
.collect(Collectors.toList()))
|
||||
||
|
||||
checkContains(
|
||||
res, getList(Arrays.asList(or.getCreditname().split(" ")))
|
||||
.stream()
|
||||
.sorted()
|
||||
.collect(Collectors.toList()))
|
||||
||
|
||||
or
|
||||
.getOtherNames()
|
||||
.stream()
|
||||
.anyMatch(
|
||||
on -> checkContains(
|
||||
res, getList(Arrays.asList(on.split(" ")))
|
||||
.stream()
|
||||
.sorted()
|
||||
.collect(Collectors.toList())));
|
||||
} catch (Exception e) {
|
||||
|
||||
log.info("EXCEPTIONNNN: " + new ObjectMapper().writeValueAsString(input));
|
||||
throw e;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static boolean checkContains(List<String> result, List<String> orcid) {
|
||||
return checkContains(result, orcid, true);
|
||||
}
|
||||
|
||||
private static boolean checkContains(List<String> result, List<String> orcid, boolean jaro) {
|
||||
if (result.size() == 0 || orcid.size() == 0) {
|
||||
return true;
|
||||
}
|
||||
String[][] input = {
|
||||
{
|
||||
"1", StringUtils.joinWith(" ", result)
|
||||
},
|
||||
{
|
||||
"2", StringUtils.joinWith(" ", orcid)
|
||||
}
|
||||
};
|
||||
// exact match word by word
|
||||
Double cosineDistance = new CosineDistance().apply(input[0][1], input[1][1]);
|
||||
if (Math.round((1 - cosineDistance) * 100) == 100) {
|
||||
return true;
|
||||
}
|
||||
// check containment: one list can be larger than the other, and words may be composed to form a name (see the usage sketch after this class)
|
||||
// e.g. pengli yan = li peng yan
|
||||
if (orcid.size() < result.size()) {
|
||||
if (isIn(orcid, result))
|
||||
return true;
|
||||
} else {
|
||||
if (isIn(result, orcid))
|
||||
return true;
|
||||
}
|
||||
if (jaro) {
|
||||
// apply JaroWinkler distance
|
||||
double score = new JaroWinkler()
|
||||
.score(StringUtils.joinWith(" ", result), StringUtils.joinWith(" ", orcid));
|
||||
return score > 0.95;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean isIn(List<String> lst1, List<String> lst2) {
|
||||
|
||||
int index = 0;
|
||||
for (String word : lst1) {
|
||||
int i = index;
|
||||
boolean found = false;
|
||||
while (i < lst2.size()) {
|
||||
String wordlist = lst2.get(i);
|
||||
if (word.equals(wordlist)) {
|
||||
index = i + 1;
|
||||
i = lst2.size();
|
||||
found = true;
|
||||
} else {
|
||||
if (word.charAt(0) < wordlist.charAt(0)) {
|
||||
if (!checkComposition(word, lst2)) {
|
||||
return false;
|
||||
} else {
|
||||
index = 0;
|
||||
i = lst2.size();
|
||||
found = true;
|
||||
}
|
||||
} else {
|
||||
if (word.length() == 1 || wordlist.length() == 1) {
|
||||
if (word.charAt(0) == wordlist.charAt(0)) {
|
||||
index = i + 1;
|
||||
i = lst2.size();
|
||||
found = true;
|
||||
} else {
|
||||
i++;
|
||||
}
|
||||
} else {
|
||||
i++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
if (!found) {
|
||||
if (!checkComposition(word, lst2)) {
|
||||
return false;
|
||||
} else {
|
||||
index = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static boolean checkComposition(String word, List<String> lst2) {
|
||||
for (int i = 0; i < lst2.size(); i++) {
|
||||
for (int j = 0; j < lst2.size(); j++) {
|
||||
if (i != j) {
|
||||
String w = lst2.get(i) + lst2.get(j);
|
||||
if (word.equals(w)) {
|
||||
if (i > j) {
|
||||
lst2.remove(i);
|
||||
lst2.remove(j);
|
||||
} else {
|
||||
lst2.remove(j);
|
||||
lst2.remove(i);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
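A minimal usage sketch of the public matching helpers defined above; the names are illustrative and the Spark wiring is omitted. It exercises the composition case mentioned in the comments, where "pengli" can be rebuilt from "li" + "peng".

import java.util.Collections;

import eu.dnetlib.dhp.ircdl_extention.Utils;
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import scala.Tuple2;

public class UtilsUsageSketch {
    public static void main(String[] args) throws Exception {
        // Result.newInstance only needs the full name string.
        Result result = Result.newInstance("pengli yan");
        // Orcid.newInstance(name, surname, creditName, otherNames) as defined in this diff.
        Orcid orcid = Orcid.newInstance("li peng", "yan", "li peng yan", Collections.emptyList());
        // filterFunction matches on exact tokens, containment with word composition
        // ("pengli" = "li" + "peng"), or a JaroWinkler score above 0.95.
        boolean match = Utils.filterFunction(new Tuple2<>(result, orcid));
        // conservativeFilterFunction only compares pairs of word initials.
        boolean conservativeMatch = Utils.conservativeFilterFunction(new Tuple2<>(result, orcid));
        System.out.println(match + " " + conservativeMatch); // both should print true for this pair
    }
}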
|
@ -0,0 +1,174 @@
|
|||
|
||||
package eu.dnetlib.dhp.ircdl_extention;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapGroupsFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.ircdl_extention.model.Author;
|
||||
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;
|
||||
import eu.dnetlib.dhp.ircdl_extention.model.Result;
|
||||
import eu.dnetlib.dhp.ircdl_extention.model.ShuffleInfo;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class WrongSpark {
|
||||
/**
|
||||
* takes as input the normalized ORCID records and the normalized entries to be checked against ORCID;
|
||||
* returns a lower bound of the wrong attributions (a minimal pair example follows this class)
|
||||
*/
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
WrongSpark.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/ircdl_extention/wrong_orcid_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
final String orcidPath = parser.get("orcidPath");
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
|
||||
final String resultPath = parser.get("inputPath");
|
||||
|
||||
final String authorPath = parser.get("authorPath");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");
|
||||
|
||||
runWithSparkHiveSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
findWrong(spark, orcidPath, outputPath + "/wrong", resultPath);
|
||||
findShuffle(spark, orcidPath, outputPath + "/shuffle", resultPath, authorPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void findShuffle(SparkSession spark, String orcidPath, String outputPath, String resultPath,
|
||||
String authorPath) {
|
||||
|
||||
Utils
|
||||
.readPath(spark, authorPath, Author.class)
|
||||
.map(
|
||||
(MapFunction<Author, ShuffleInfo>) r -> ShuffleInfo
|
||||
.newInstance(r.getName(), r.getSurname(), r.getFullname(), r.getId()),
|
||||
Encoders.bean(ShuffleInfo.class))
|
||||
.union(
|
||||
getWrong(spark, orcidPath, resultPath)
|
||||
.map((MapFunction<Tuple2<Result, Orcid>, ShuffleInfo>) t2 ->
|
||||
|
||||
ShuffleInfo
|
||||
.newInstance(
|
||||
t2._1().getName(), t2._1().getSurname(), t2._1().getFullname(),
|
||||
t2._1().getId(), t2._2().getName(), t2._2().getSurname(),
|
||||
t2._2().getCreditname(), t2._2().getOtherNames(), t2._2().getOrcid()),
|
||||
Encoders.bean(ShuffleInfo.class)))
|
||||
.groupByKey((MapFunction<ShuffleInfo, String>) si -> si.getId(), Encoders.STRING())
|
||||
.flatMapGroups((FlatMapGroupsFunction<String, ShuffleInfo, ShuffleInfo>) (s, it) -> {
|
||||
List<ShuffleInfo> shuffleInfoList = new ArrayList<>();
|
||||
List<ShuffleInfo> ret = new ArrayList<>();
|
||||
shuffleInfoList.add(it.next());
|
||||
it.forEachRemaining(e -> shuffleInfoList.add(e));
|
||||
shuffleInfoList
|
||||
.stream()
|
||||
.filter(e -> Optional.ofNullable(e.getOrcid()).isPresent())
|
||||
.forEach(e -> {
|
||||
if (checkShuffle(e, shuffleInfoList))
|
||||
ret.add(e);
|
||||
});
|
||||
return ret.iterator();
|
||||
}, Encoders.bean(ShuffleInfo.class))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
|
||||
/*
 * def checkShuffle(x):
 *     alis = [a for a in x[1]]
 *     dic = {}
 *     count = 0
 *     for entry in alis:
 *         if entry['orcid'] != '':
 *             dic[entry['orcid']] = entry
 *     for orcid in dic:
 *         name = dic[orcid]['oname']
 *         surname = dic[orcid]['osurname']
 *         for author in alis:
 *             if author['aname'] == "" or author['asurname'] == "":
 *                 if checkContains([author['afullname']], addInListAll([], name + " " + surname)):
 *                     count += 1
 *                     break
 *             else:
 *                 if checkContains([author['aname'] + " " + author['asurname']], addInListAll([], name + " " + surname)):
 *                     count += 1
 *                     break
 *     return count
 */
|
||||
// candidate_shuffle = zenodo_normalized
//     .map(lambda x: (x['id'], {'aname': x['name'], 'asurname': x['surname'],
//                               'afullname': x['fullname'], 'oname': "", 'osurname': "", 'orcid': ''}))
//     .union(join_orcid_filtered
//         .map(lambda e: (e['id'], {'aname': e['nameg'], 'asurname': e['surnameg'],
//                                   'afullname': e['fullnameg'], 'oname': e['name'],
//                                   'osurname': e['surname'], 'orcid': e['orcid']})))
//     .groupByKey()
//     .filter(toBeChecked)
|
||||
}
|
||||
|
||||
private static boolean checkShuffle(ShuffleInfo e, List<ShuffleInfo> shuffleInfoList) {
|
||||
|
||||
boolean b = shuffleInfoList
|
||||
.stream()
|
||||
.anyMatch(
|
||||
si -> {
|
||||
try {
|
||||
return Utils
|
||||
.filterFunction(
|
||||
new Tuple2<>(Result.newInstance(si.getAfullname()),
|
||||
Orcid
|
||||
.newInstance(
|
||||
e.getOname(), e.getOsurname(), e.getOcreditName(),
|
||||
e.getoOtherNames())));
|
||||
} catch (JsonProcessingException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
});
|
||||
|
||||
return b;
|
||||
}
|
||||
|
||||
private static Dataset<Tuple2<Result, Orcid>> getWrong(SparkSession spark, String orcidPath, String resultPath) {
|
||||
Dataset<Orcid> orcidDataset = Utils
|
||||
.readPath(spark, orcidPath, Orcid.class)
|
||||
.filter((FilterFunction<Orcid>) o -> !o.getName().contains("deactivated"));
|
||||
Dataset<Result> resultDataset = Utils.readPath(spark, resultPath, Result.class);
|
||||
|
||||
return resultDataset
|
||||
.joinWith(
|
||||
orcidDataset, resultDataset
|
||||
.col("oid")
|
||||
.equalTo(orcidDataset.col("orcid")),
|
||||
"inner")
|
||||
.filter((FilterFunction<Tuple2<Result, Orcid>>) t2 -> !Utils.conservativeFilterFunction(t2));
|
||||
}
|
||||
|
||||
private static void findWrong(SparkSession spark, String orcidPath, String outputPath, String resultPath) {
|
||||
getWrong(spark, orcidPath, resultPath)
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
|
||||
}
|
||||
|
||||
}
|
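A minimal sketch, with illustrative names only, of the kind of pair that getWrong above flags: a joined result/ORCID pair counts as a wrong attribution when the conservative initials-based check does not pass.

import java.util.Collections;

import eu.dnetlib.dhp.ircdl_extention.Utils;
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import scala.Tuple2;

public class WrongPairSketch {
    public static void main(String[] args) {
        // Author string found on the product and the ORCID record it was linked to.
        Result result = Result.newInstance("john smith");
        Orcid orcid = Orcid.newInstance("maria", "rossi", "maria rossi", Collections.emptyList());
        // getWrong keeps the joined pair when conservativeFilterFunction returns false,
        // i.e. when no pair of word initials is compatible between the two sides.
        boolean wrongAttribution = !Utils.conservativeFilterFunction(new Tuple2<>(result, orcid));
        System.out.println(wrongAttribution); // expected: true for this pair
    }
}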
|
@ -0,0 +1,18 @@
|
|||
|
||||
package eu.dnetlib.dhp.ircdl_extention.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Author extends Result implements Serializable {
|
||||
|
||||
private List<KeyValue> apid;
|
||||
|
||||
public List<KeyValue> getApid() {
|
||||
return apid;
|
||||
}
|
||||
|
||||
public void setApid(List<KeyValue> apid) {
|
||||
this.apid = apid;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
|
||||
package eu.dnetlib.dhp.ircdl_extention.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class KeyValue implements Serializable {
|
||||
private String key;
|
||||
private String value;
|
||||
|
||||
public static KeyValue newInstance(String key, String value) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.key = key;
|
||||
kv.value = value;
|
||||
|
||||
return kv;
|
||||
}
|
||||
|
||||
public String getKey() {
|
||||
return key;
|
||||
}
|
||||
|
||||
public void setKey(String key) {
|
||||
this.key = key;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,89 @@
|
|||
|
||||
package eu.dnetlib.dhp.ircdl_extention.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Orcid implements Serializable {
|
||||
private List<String> otherNames;
|
||||
private String inception;
|
||||
private String surname;
|
||||
private String mode;
|
||||
private String creditname;
|
||||
private String orcid;
|
||||
private Boolean works;
|
||||
private String name;
|
||||
|
||||
public static Orcid newInstance(String oname, String osurname, String ocreditName, List<String> oOtherNames) {
|
||||
Orcid o = new Orcid();
|
||||
o.name = oname;
|
||||
o.surname = osurname;
|
||||
o.creditname = ocreditName;
|
||||
o.otherNames = oOtherNames;
|
||||
return o;
|
||||
}
|
||||
|
||||
public List<String> getOtherNames() {
|
||||
return otherNames;
|
||||
}
|
||||
|
||||
public void setOtherNames(List<String> otherNames) {
|
||||
this.otherNames = otherNames;
|
||||
}
|
||||
|
||||
public String getInception() {
|
||||
return inception;
|
||||
}
|
||||
|
||||
public void setInception(String inception) {
|
||||
this.inception = inception;
|
||||
}
|
||||
|
||||
public String getSurname() {
|
||||
return surname;
|
||||
}
|
||||
|
||||
public void setSurname(String surname) {
|
||||
this.surname = surname;
|
||||
}
|
||||
|
||||
public String getMode() {
|
||||
return mode;
|
||||
}
|
||||
|
||||
public void setMode(String mode) {
|
||||
this.mode = mode;
|
||||
}
|
||||
|
||||
public String getCreditname() {
|
||||
return creditname;
|
||||
}
|
||||
|
||||
public void setCreditname(String creditname) {
|
||||
this.creditname = creditname;
|
||||
}
|
||||
|
||||
public String getOrcid() {
|
||||
return orcid;
|
||||
}
|
||||
|
||||
public void setOrcid(String oid) {
|
||||
this.orcid = oid;
|
||||
}
|
||||
|
||||
public Boolean getWorks() {
|
||||
return works;
|
||||
}
|
||||
|
||||
public void setWorks(Boolean works) {
|
||||
this.works = works;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,109 @@
|
|||
|
||||
package eu.dnetlib.dhp.ircdl_extention.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Result implements Serializable {
|
||||
private Boolean deletedbyinference;
|
||||
private String id;
|
||||
private List<KeyValue> cf;
|
||||
private List<KeyValue> pid;
|
||||
private String name;
|
||||
private String surname;
|
||||
private String fullname;
|
||||
private String oid;
|
||||
|
||||
public static Result newInstance(String afullname) {
|
||||
Result r = new Result();
|
||||
r.fullname = afullname;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
public static Result fromAuthor(Author p) {
|
||||
Result r = new Result();
|
||||
r.deletedbyinference = p.getDeletedbyinference();
|
||||
r.id = p.getId();
|
||||
r.cf = p.getCf();
|
||||
r.pid = p.getPid();
|
||||
r.name = p.getName();
|
||||
r.surname = p.getSurname();
|
||||
r.fullname = p.getFullname();
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
public Boolean getDeletedbyinference() {
|
||||
return deletedbyinference;
|
||||
}
|
||||
|
||||
public void setDeletedbyinference(Boolean deletedbyinference) {
|
||||
this.deletedbyinference = deletedbyinference;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public List<KeyValue> getCf() {
|
||||
return cf;
|
||||
}
|
||||
|
||||
public void setCf(List<KeyValue> cf) {
|
||||
this.cf = cf;
|
||||
}
|
||||
|
||||
public List<KeyValue> getPid() {
|
||||
return pid;
|
||||
}
|
||||
|
||||
public void setPid(List<KeyValue> pid) {
|
||||
this.pid = pid;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
if (name != null)
|
||||
this.name = name.toLowerCase();
|
||||
else
|
||||
this.name = new String();
|
||||
}
|
||||
|
||||
public String getSurname() {
|
||||
return surname;
|
||||
}
|
||||
|
||||
public void setSurname(String surname) {
|
||||
if (surname != null)
|
||||
this.surname = surname.toLowerCase();
|
||||
else
|
||||
this.surname = new String();
|
||||
}
|
||||
|
||||
public String getFullname() {
|
||||
return fullname;
|
||||
}
|
||||
|
||||
public void setFullname(String fullname) {
|
||||
if (fullname != null)
|
||||
this.fullname = fullname.toLowerCase();
|
||||
else
|
||||
this.fullname = new String();
|
||||
}
|
||||
|
||||
public String getOid() {
|
||||
return oid;
|
||||
}
|
||||
|
||||
public void setOid(String oid) {
|
||||
this.oid = oid;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,125 @@
|
|||
|
||||
package eu.dnetlib.dhp.ircdl_extention.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class ShuffleInfo implements Serializable {
|
||||
|
||||
private String aname;
|
||||
private String asurname;
|
||||
private String afullname;
|
||||
private String oname;
|
||||
private String osurname;
|
||||
private String ocreditName;
|
||||
private List<String> oOtherNames;
|
||||
private String orcid;
|
||||
private String id;
|
||||
private String pid;
|
||||
|
||||
public String getPid() {
|
||||
return pid;
|
||||
}
|
||||
|
||||
public void setPid(String pid) {
|
||||
this.pid = pid;
|
||||
}
|
||||
|
||||
public String getAname() {
|
||||
return aname;
|
||||
}
|
||||
|
||||
public void setAname(String aname) {
|
||||
this.aname = aname;
|
||||
}
|
||||
|
||||
public String getAsurname() {
|
||||
return asurname;
|
||||
}
|
||||
|
||||
public void setAsurname(String asurname) {
|
||||
this.asurname = asurname;
|
||||
}
|
||||
|
||||
public String getAfullname() {
|
||||
return afullname;
|
||||
}
|
||||
|
||||
public void setAfullname(String afullname) {
|
||||
this.afullname = afullname;
|
||||
}
|
||||
|
||||
public String getOname() {
|
||||
return oname;
|
||||
}
|
||||
|
||||
public void setOname(String oname) {
|
||||
this.oname = oname;
|
||||
}
|
||||
|
||||
public String getOsurname() {
|
||||
return osurname;
|
||||
}
|
||||
|
||||
public void setOsurname(String osurname) {
|
||||
this.osurname = osurname;
|
||||
}
|
||||
|
||||
public String getOcreditName() {
|
||||
return ocreditName;
|
||||
}
|
||||
|
||||
public void setOcreditName(String ocreditName) {
|
||||
this.ocreditName = ocreditName;
|
||||
}
|
||||
|
||||
public List<String> getoOtherNames() {
|
||||
return oOtherNames;
|
||||
}
|
||||
|
||||
public void setoOtherNames(List<String> oOtherNames) {
|
||||
this.oOtherNames = oOtherNames;
|
||||
}
|
||||
|
||||
public String getOrcid() {
|
||||
return orcid;
|
||||
}
|
||||
|
||||
public void setOrcid(String orcid) {
|
||||
this.orcid = orcid;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public static ShuffleInfo newInstance(String aname, String asurname, String afullname, String id) {
|
||||
ShuffleInfo si = new ShuffleInfo();
|
||||
si.afullname = afullname;
|
||||
si.aname = aname;
|
||||
si.asurname = asurname;
|
||||
si.id = id;
|
||||
return si;
|
||||
}
|
||||
|
||||
public static ShuffleInfo newInstance(String aname, String asurname, String afullname, String id, String oname,
|
||||
String osurname, String ocredtname, List<String> oOthername, String orcid, String pid) {
|
||||
ShuffleInfo si = new ShuffleInfo();
|
||||
si.afullname = afullname;
|
||||
si.aname = aname;
|
||||
si.asurname = asurname;
|
||||
si.id = id;
|
||||
si.oname = oname;
|
||||
si.osurname = osurname;
|
||||
si.ocreditName = ocredtname;
|
||||
si.oOtherNames = oOthername;
|
||||
si.orcid = orcid;
|
||||
si.pid = pid;
|
||||
return si;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
TUBYDI - Assistir Filmes e Series Online Grátis
|
||||
123Movies
|
||||
WATCH FULL MOVIE
|
||||
Movierulz
|
||||
Full Movie Online
|
||||
MOVIé WatcH
|
||||
The King of Staten Island 2020 Online For Free
|
||||
Watch Train to Busan 2 2020 online for free
|
||||
Sixth Sense Movie Novelization
|
||||
Film Complet streaming vf gratuit en ligne
|
||||
watch now free
|
||||
LIVE stream watch
|
||||
LIVE stream UFC
|
||||
RBC Heritage live stream
|
||||
MLBStreams Free
|
||||
NFL Live Stream
|
||||
Live Stream Free
|
||||
Royal Ascot 2020 Live Stream
|
||||
TV Shows Full Episodes Official
|
||||
FuboTV
|
||||
Gomovies
|
||||
Online Free Trial Access
|
||||
123watch
|
||||
DÜŞÜK HAPI
|
||||
Bebek Düşürme Yöntemleri
|
||||
WHATSAP İLETİŞİM
|
||||
Cytotec
|
||||
düşük hapı
|
|
@ -0,0 +1,21 @@
|
|||
[
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source mdstore path",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetPath",
|
||||
"paramDescription": "the target mdstore path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "m",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "the master name",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,26 @@
|
|||
[
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source mdstore path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetPath",
|
||||
"paramDescription": "the target mdstore path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "m",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "the master name",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "the isLookup URL",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,23 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,46 @@
|
|||
<workflow-app name="Import_Datacite_and_transform_to_OAF" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>mainPath</name>
|
||||
<description>the working path of Datacite stores</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>The IS lookUp service endopoint</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="TransformJob"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="TransformJob">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>TransformJob</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
|
||||
<arg>--targetPath</arg><arg>${mainPath}/production/datacite_oaf</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,23 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,46 @@
|
|||
<workflow-app name="Datacite_to_ActionSet_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the working path of Datacite stores</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the path of Datacite ActionSet</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ExportDataset"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="ExportDataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExportDataset</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,58 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorNumber</name>
|
||||
<value>4</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<value>15G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<value>6G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<value>1</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,516 @@
|
|||
<workflow-app name="IRCDL Extention" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
|
||||
<start to="deleteoutputpath"/>
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
<action name="deleteoutputpath">
|
||||
<fs>
|
||||
<delete path='${outputPath}'/>
|
||||
<mkdir path='${outputPath}'/>
|
||||
<delete path='${workingDir}'/>
|
||||
<mkdir path='${workingDir}'/>
|
||||
</fs>
|
||||
<ok to="fork_prepare"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="fork_prepare">
|
||||
<path start="fork_prepare_result"/>
|
||||
<path start="prepare_orcid"/>
|
||||
</fork>
|
||||
|
||||
|
||||
<action name="prepare_orcid">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareNormalizedOrcid</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.PrepareNormalizedOrcid</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${orcidInputPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
|
||||
</spark>
|
||||
<ok to="join_fork"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="fork_prepare_result">
|
||||
<path start="prepare_publication"/>
|
||||
<path start="prepare_dataset"/>
|
||||
<path start="prepare_software"/>
|
||||
<path start="prepare_other"/>
|
||||
</fork>
|
||||
|
||||
<action name="prepare_publication">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareResult</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/publication</arg>
|
||||
<arg>--resultClass</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/publicationsWithOrcid</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait_prepare_result"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="prepare_dataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareResult</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
|
||||
<arg>--resultClass</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/datasetWithOrcid</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait_prepare_result"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="prepare_software">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareResult</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/software</arg>
|
||||
<arg>--resultClass</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/softwareWithOrcid</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait_prepare_result"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="prepare_other">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareResult</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
|
||||
<arg>--resultClass</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/otherWithOrcid</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="wait_prepare_result"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<join name="wait_prepare_result" to="normalize_result"/>
|
||||
|
||||
|
||||
<action name="normalize_result">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareNormalizedResult</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.PrepareNormalizedResultSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/Normalized/</arg>
|
||||
</spark>
|
||||
<ok to="select_only_author_with_orcid"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="select_only_author_with_orcid">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>SelectAuthorWithOrcidOnly</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.SelectAuthorWithOrcidOnlySpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultWithOrcid/</arg>
|
||||
</spark>
|
||||
<ok to="fork_get_result_info"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="fork_get_result_info">
|
||||
<path start="get_result_instrepo"/>
|
||||
<path start="get_result_datacite"/>
|
||||
<path start="get_result_crossref"/>
|
||||
</fork>
|
||||
|
||||
|
||||
<action name="get_result_instrepo">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GetResultInstRepo</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultFromInstRepo</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultWithOrcid/</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/InstRepo/</arg>
|
||||
<arg>--datasourcePath</arg><arg>${datasourcePath}</arg>
|
||||
</spark>
|
||||
<ok to="wait_res_info"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="get_result_datacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareDatacite</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.PrepareDataciteSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultWithOrcid/</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/Datacite/</arg>
|
||||
</spark>
|
||||
<ok to="wait_res_info"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="get_result_crossref">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareCrossref</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.PrepareCrossrefSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultWithOrcid/</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/Crossref/</arg>
|
||||
</spark>
|
||||
<ok to="wait_res_info"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<join name="wait_res_info" to="get_result_alltherest"/>
|
||||
|
||||
<action name="get_result_alltherest">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareResultAllTheRest</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultAllTheRestSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultWithOrcid/</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/AllTheRest/</arg>
|
||||
<arg>--instRepoPath</arg><arg>${workingDir}/GRAPH/InstRepo/</arg>
|
||||
<arg>--datacitePath</arg><arg>${workingDir}/GRAPH/Datacite/</arg>
|
||||
<arg>--crossrefPath</arg><arg>${workingDir}/GRAPH/Crossref/</arg>
|
||||
</spark>
|
||||
<ok to="join_fork"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="join_fork" to="fork_get_wrong"/>
|
||||
|
||||
<fork name="fork_get_wrong">
|
||||
<path start="get_wrong_instrepo"/>
|
||||
<path start="get_wrong_datacite"/>
|
||||
<path start="get_wrong_crossref"/>
|
||||
<path start="get_wrong_alltherest"/>
|
||||
<path start="get_wrong_zenodo"/>
|
||||
<path start="get_wrong_figshare"/>
|
||||
<path start="get_wrong_dryad"/>
|
||||
</fork>
|
||||
|
||||
<action name="get_wrong_instrepo">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>WrongInstRepo</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/InstRepo/</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/InstRepo/</arg>
|
||||
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
|
||||
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
|
||||
</spark>
|
||||
<ok to="jojn_wrong"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="get_wrong_datacite">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>WrongDatacite</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Datacite/allDatacite/</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/Datacite/</arg>
|
||||
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
|
||||
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
|
||||
</spark>
|
||||
<ok to="jojn_wrong"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="get_wrong_crossref">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>WrongCrossref</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Crossref/</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/Crossref/</arg>
|
||||
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
|
||||
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
|
||||
</spark>
|
||||
<ok to="jojn_wrong"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="get_wrong_alltherest">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>WrongAllTheRest</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/AllTheRest/</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/AllTheRest/</arg>
|
||||
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
|
||||
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
|
||||
</spark>
|
||||
<ok to="jojn_wrong"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="get_wrong_zenodo">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>WrongZenodo</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Datacite/Zenodo/</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/Zenodo/</arg>
|
||||
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
|
||||
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
|
||||
</spark>
|
||||
<ok to="jojn_wrong"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="get_wrong_figshare">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>WrongFigshare</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Datacite/Figshare/</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/Figshare/</arg>
|
||||
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
|
||||
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
|
||||
</spark>
|
||||
<ok to="jojn_wrong"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="get_wrong_dryad">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>WrongDryad</name>
|
||||
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Datacite/Dryad/</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/Dryad/</arg>
|
||||
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
|
||||
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
|
||||
</spark>
|
||||
<ok to="jojn_wrong"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<join name="jojn_wrong" to="End"/>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,35 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "ip",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the URL from where to get the programme file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "op",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
},{
|
||||
"paramName": "ir",
|
||||
"paramLongName": "instRepoPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
},{
|
||||
"paramName": "dp",
|
||||
"paramLongName": "datacitePath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
},{
|
||||
"paramName": "cp",
|
||||
"paramLongName": "crossrefPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,26 @@
|
|||
[
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "ip",
    "paramLongName": "inputPath",
    "paramDescription": "the path of the input data",
    "paramRequired": true
  },
  {
    "paramName": "op",
    "paramLongName": "outputPath",
    "paramDescription": "the output path",
    "paramRequired": true
  },
  {
    "paramName": "dp",
    "paramLongName": "datasourcePath",
    "paramDescription": "the path of the datasource entities",
    "paramRequired": true
  }
]
|
|
@ -0,0 +1,20 @@
|
|||
[
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "ip",
    "paramLongName": "inputPath",
    "paramDescription": "the path of the input data",
    "paramRequired": true
  },
  {
    "paramName": "op",
    "paramLongName": "outputPath",
    "paramDescription": "the output path",
    "paramRequired": true
  }
]
|
|
@ -0,0 +1,26 @@
|
|||
[
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "ip",
    "paramLongName": "inputPath",
    "paramDescription": "the path of the input data",
    "paramRequired": true
  },
  {
    "paramName": "op",
    "paramLongName": "outputPath",
    "paramDescription": "the output path",
    "paramRequired": true
  },
  {
    "paramName": "rc",
    "paramLongName": "resultClass",
    "paramDescription": "the class of the result type to consider",
    "paramRequired": false
  }
]
|
|
@ -0,0 +1,32 @@
|
|||
[
  {
    "paramName": "issm",
    "paramLongName": "isSparkSessionManaged",
    "paramDescription": "when true will stop SparkSession after job execution",
    "paramRequired": false
  },
  {
    "paramName": "op",
    "paramLongName": "orcidPath",
    "paramDescription": "the path of the normalized ORCID authors",
    "paramRequired": true
  },
  {
    "paramName": "op",
    "paramLongName": "outputPath",
    "paramDescription": "the output path",
    "paramRequired": true
  },
  {
    "paramName": "ip",
    "paramLongName": "inputPath",
    "paramDescription": "the path of the input results",
    "paramRequired": true
  },
  {
    "paramName": "ap",
    "paramLongName": "authorPath",
    "paramDescription": "the path of the normalized result authors",
    "paramRequired": true
  }
]
|
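A minimal sketch of how a parameter file like the one above is typically consumed by the Spark jobs in this module; the class name and resource path below are illustrative assumptions (not part of this changeset), while the parsing pattern mirrors ExtractCrossrefRecords further down.

package eu.dnetlib.dhp.ircdl_extention;

import org.apache.commons.io.IOUtils;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class WrongOrcidArgsExample {
    public static void main(String[] args) throws Exception {
        // hypothetical resource location for the JSON specification shown above
        final ArgumentApplicationParser parser = new ArgumentApplicationParser(
            IOUtils
                .toString(
                    WrongOrcidArgsExample.class
                        .getResourceAsStream("/eu/dnetlib/dhp/ircdl_extention/wrong_orcid_parameters.json")));
        parser.parseArgument(args);

        // long names match the --orcidPath / --ap arguments passed by the workflow above
        System.out.println(parser.get("orcidPath"));
        System.out.println(parser.get("authorPath"));
    }
}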
|
@ -0,0 +1,50 @@
|
|||
|
||||
package eu.dnetlib.dhp.aggregation;
|
||||
|
||||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.mockito.Mock;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
public abstract class AbstractVocabularyTest {
|
||||
|
||||
@Mock
|
||||
protected ISLookUpService isLookUpService;
|
||||
|
||||
protected VocabularyGroup vocabularies;
|
||||
|
||||
public void setUpVocabulary() throws ISLookUpException, IOException {
|
||||
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
|
||||
.thenReturn(synonyms());
|
||||
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
||||
}
|
||||
|
||||
private static List<String> vocs() throws IOException {
|
||||
return IOUtils
|
||||
.readLines(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
AbstractVocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/vocabulary/terms.txt")));
|
||||
}
|
||||
|
||||
private static List<String> synonyms() throws IOException {
|
||||
return IOUtils
|
||||
.readLines(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
AbstractVocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/vocabulary/synonyms.txt")));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
package eu.dnetlib.dhp.actionmanager.datacite
|
||||
|
||||
|
||||
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf
|
||||
import org.junit.jupiter.api.extension.ExtendWith
|
||||
import org.junit.jupiter.api.{BeforeEach, Test}
|
||||
import org.mockito.junit.jupiter.MockitoExtension
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
@ExtendWith(Array(classOf[MockitoExtension]))
|
||||
class DataciteToOAFTest extends AbstractVocabularyTest{
|
||||
|
||||
|
||||
@BeforeEach
|
||||
def setUp() :Unit = {
|
||||
|
||||
super.setUpVocabulary()
|
||||
}
|
||||
|
||||
@Test
|
||||
def testMapping() :Unit = {
|
||||
val record =Source.fromInputStream(getClass.getResourceAsStream("datacite.json")).mkString
|
||||
|
||||
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies )
|
||||
println (mapper.defaultPrettyPrintingWriter().writeValueAsString(res.head))
|
||||
|
||||
|
||||
}
|
||||
@Test
|
||||
def testDate():Unit = {
|
||||
|
||||
println(DataciteToOAFTransformation.fix_thai_date("01-01-2561","[dd-MM-yyyy]"))
|
||||
println(DataciteToOAFTransformation.fix_thai_date("2561-01-01","[yyyy-MM-dd]"))
|
||||
|
||||
}
|
||||
|
||||
}
|
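For context on the testDate expectations above: the Thai Buddhist calendar runs 543 years ahead of the Gregorian one, so 01-01-2561 corresponds to 2018-01-01. A standalone sketch with the JDK calendar API (this is not the DataciteToOAFTransformation implementation, only an illustration of the expected conversion):

import java.time.LocalDate;
import java.time.chrono.ThaiBuddhistDate;

public class ThaiDateExample {
    public static void main(String[] args) {
        // Buddhist-era year 2561 maps to Gregorian 2018 (2561 - 543)
        ThaiBuddhistDate thai = ThaiBuddhistDate.of(2561, 1, 1);
        System.out.println(LocalDate.from(thai)); // 2018-01-01
    }
}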
|
@ -0,0 +1,98 @@
|
|||
|
||||
package eu.dnetlib.dhp.ircdl_extention;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob;
|
||||
import eu.dnetlib.dhp.actionmanager.project.SparkUpdateProjectTest;
|
||||
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
|
||||
public class NormalizeOrcidTest {
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final ClassLoader cl = eu.dnetlib.dhp.ircdl_extention.NormalizeOrcidTest.class
|
||||
.getClassLoader();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(eu.dnetlib.dhp.ircdl_extention.NormalizeOrcidTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(eu.dnetlib.dhp.ircdl_extention.NormalizeOrcidTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(eu.dnetlib.dhp.ircdl_extention.NormalizeOrcidTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
// conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
// conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(NormalizeOrcidTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void normalizeOrcid() throws Exception {
|
||||
PrepareNormalizedOrcid
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputPath",
|
||||
getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/ircdl_extention/orcid_original.json")
|
||||
.getPath(),
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/orcidNormalized"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Orcid> tmp = sc
|
||||
.textFile(workingDir.toString() + "/orcidNormalized")
|
||||
.map(value -> OBJECT_MAPPER.readValue(value, Orcid.class));
|
||||
|
||||
tmp.foreach(v -> System.out.println(OBJECT_MAPPER.writeValueAsString(v)));
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,280 @@
|
|||
|
||||
package eu.dnetlib.dhp.ircdl_extention;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
|
||||
|
||||
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;
|
||||
import eu.dnetlib.dhp.ircdl_extention.model.Result;
|
||||
import eu.dnetlib.dhp.ircdl_extention.model.ShuffleInfo;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class WrongOrcidTest {
|
||||
|
||||
@Test
|
||||
public void wrongOrcidFalse() throws Exception {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
Utils
|
||||
.filterFunction(
|
||||
new Tuple2<>(Result.newInstance("veigas pires cristina"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"cristina", "veiga pires", "c veiga pires",
|
||||
Arrays.asList("c c veiga pires")))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidFalse2() throws Exception {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
Utils
|
||||
.filterFunction(
|
||||
new Tuple2<>(Result.newInstance("yushkevich p"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"paul", "yushkevich", "paul a yushkevich",
|
||||
new ArrayList<>()))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidFalse3() throws Exception {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
Utils
|
||||
.filterFunction(
|
||||
new Tuple2<>(Result.newInstance("ghosh ss"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"satrajit", "ghosh",
|
||||
"satra",
|
||||
Arrays.asList("satra", "satrajit s ghosh")))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidTrue() throws Exception {
|
||||
Assertions
|
||||
.assertFalse(
|
||||
Utils
|
||||
.filterFunction(
|
||||
new Tuple2<>(Result.newInstance("kluft lukas"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"satrajit", "ghosh",
|
||||
"satra",
|
||||
Arrays.asList("satra", "satrajit s ghosh")))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidFalse4() throws Exception {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
Utils
|
||||
.filterFunction(
|
||||
new Tuple2<>(Result.newInstance("schulz s a"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"sebastian", "schulz",
|
||||
"sebastian a schulz",
|
||||
new ArrayList<>()))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidFalse5() throws Exception {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
Utils
|
||||
.filterFunction(
|
||||
new Tuple2<>(Result.newInstance("domingues af"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"allysson", "domingues",
|
||||
"allysson f domingues",
|
||||
new ArrayList<>()))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidFalseConservative() throws Exception {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
Utils
|
||||
.conservativeFilterFunction(
|
||||
new Tuple2<>(Result.newInstance("veigas pires cristina"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"cristina", "veiga pires", "c veiga pires",
|
||||
Arrays.asList("c c veiga pires")))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidFalseConservative2() throws Exception {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
Utils
|
||||
.conservativeFilterFunction(
|
||||
new Tuple2<>(Result.newInstance("yushkevich p"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"paul", "yushkevich", "paul a yushkevich",
|
||||
new ArrayList<>()))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidFalseConservative3() throws Exception {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
Utils
|
||||
.conservativeFilterFunction(
|
||||
new Tuple2<>(Result.newInstance("ghosh ss"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"satrajit", "ghosh",
|
||||
"satra",
|
||||
Arrays.asList("satra", "satrajit s ghosh")))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidTrueConservative() throws Exception {
|
||||
Assertions
|
||||
.assertFalse(
|
||||
Utils
|
||||
.conservativeFilterFunction(
|
||||
new Tuple2<>(Result.newInstance("kluft lukas"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"satrajit", "ghosh",
|
||||
"satra",
|
||||
Arrays.asList("satra", "satrajit s ghosh")))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidFalseConservative4() throws Exception {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
Utils
|
||||
.conservativeFilterFunction(
|
||||
new Tuple2<>(Result.newInstance("schulz s a"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"sebastian", "schulz",
|
||||
"sebastian a schulz",
|
||||
new ArrayList<>()))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidFalseConservative5() throws Exception {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
Utils
|
||||
.conservativeFilterFunction(
|
||||
new Tuple2<>(Result.newInstance("domingues af"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"allysson", "domingues",
|
||||
"allysson f domingues",
|
||||
new ArrayList<>()))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidTrueConservative2() throws Exception {
|
||||
Assertions
|
||||
.assertFalse(
|
||||
Utils
|
||||
.conservativeFilterFunction(
|
||||
new Tuple2<>(Result.newInstance("figueiredo pontes lorena lobo de"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"moyses", "soares",
|
||||
"moyses antonio porto soares",
|
||||
new ArrayList<>()))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void wrongOrcidFalseConservative6() throws Exception {
|
||||
Assertions
|
||||
.assertTrue(
|
||||
Utils
|
||||
.conservativeFilterFunction(
|
||||
new Tuple2<>(Result.newInstance("da luz geraldo eduardo"),
|
||||
Orcid
|
||||
.newInstance(
|
||||
"geraldo", "luz jr",
|
||||
"luz jr g e",
|
||||
new ArrayList<>()))));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testShuffle() throws Exception {
|
||||
|
||||
List<ShuffleInfo> shuffleInfoList = new ArrayList<>();
|
||||
|
||||
shuffleInfoList
|
||||
.add(
|
||||
ShuffleInfo
|
||||
.newInstance(
|
||||
"Miriam", "Baglioni", "Miriam Baglioni", "50|fake_1",
|
||||
"Alessia", "Bardi", "", new ArrayList<String>(), "orcid_alessia"));
|
||||
shuffleInfoList.add(ShuffleInfo.newInstance("Alessia", "Bardi", "Alessia Bardi", "50|fake_1"));
|
||||
shuffleInfoList.add(ShuffleInfo.newInstance("Miriam", "Baglioni", "Miriam Baglioni", "50|fake_1"));
|
||||
shuffleInfoList
|
||||
.add(
|
||||
ShuffleInfo
|
||||
.newInstance(
|
||||
"Alessia", "Bardi", "Alessia Bardi", "50|fake_1",
|
||||
"Miriam", "Baglioni", "", new ArrayList<String>(), "orcid_miriam"));
|
||||
shuffleInfoList.add(ShuffleInfo.newInstance("Claudio", "Atzori", "Claudio Atzori", "50|fake_1"));
|
||||
|
||||
List<ShuffleInfo> tmp = shuffleInfoList
|
||||
.stream()
|
||||
.filter(e -> Optional.ofNullable(e.getOrcid()).isPresent())
|
||||
.collect(Collectors.toList());
|
||||
int count = 0;
|
||||
for (ShuffleInfo e : tmp) {
|
||||
if (verifyShuffle(e, shuffleInfoList))
|
||||
count++;
|
||||
|
||||
}
|
||||
|
||||
System.out.println(count);
|
||||
}
|
||||
|
||||
private boolean verifyShuffle(ShuffleInfo e, List<ShuffleInfo> shuffleInfoList) {
|
||||
return shuffleInfoList.stream().anyMatch(si -> {
|
||||
try {
|
||||
final Orcid orcid = Orcid
|
||||
.newInstance(e.getOname(), e.getOsurname(), e.getOcreditName(), e.getoOtherNames());
|
||||
return Utils
|
||||
.filterFunction(
|
||||
new Tuple2<>(Result.newInstance(si.getAfullname()), orcid));
|
||||
} catch (JsonProcessingException ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
return false;
|
||||
});
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,29 @@
|
|||
{"otherNames": [], "inception": "2017-05-22T16:38:30.236Z", "surname": "hyy37", "mode": "Direct", "creditname": "void", "orcid": "0000-0002-8748-6992", "works": false, "name": "1380"}
|
||||
{"otherNames": [], "inception": "2017-05-25T12:50:48.761Z", "surname": "hyy75", "mode": "Direct", "creditname": "void", "orcid": "0000-0001-7773-1109", "works": false, "name": "2775"}
|
||||
{"otherNames": [], "inception": "2017-05-28T12:07:09.154Z", "surname": "hyy13", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-4728-6379", "works": false, "name": "434323"}
|
||||
{"otherNames": [], "inception": "2017-08-10T07:07:23.818Z", "surname": "hyy44", "mode": "Direct", "creditname": "void", "orcid": "0000-0001-9502-3093", "works": false, "name": "58"}
|
||||
{"otherNames": [], "inception": "2017-08-10T07:08:48.179Z", "surname": "hyy46", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-2933-0057", "works": false, "name": "60"}
|
||||
{"otherNames": ["pang x y", "pang xueyong"], "inception": "2014-10-13T03:26:21.741Z", "surname": "?", "mode": "API", "creditname": "void", "orcid": "0000-0002-7397-5824", "works": true, "name": "??"}
|
||||
{"otherNames": [], "inception": "2019-08-27T07:55:06.340Z", "surname": "therasa alphonsa", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0001-7205-6036", "works": false, "name": "a"}
|
||||
{"otherNames": ["minto"], "inception": "2020-08-02T06:33:18.620Z", "surname": "karim", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0001-6111-6742", "works": false, "name": "a k mohammad fazlul"}
|
||||
{"otherNames": [], "inception": "2014-05-01T09:13:11.783Z", "surname": "al-sammak", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0001-6646-4295", "works": false, "name": "a-imam"}
|
||||
{"otherNames": [], "inception": "2019-12-06T12:53:04.045Z", "surname": "hassan", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-2957-4641", "works": false, "name": "a-s.u."}
|
||||
{"otherNames": [], "inception": "2020-07-28T12:29:26.453Z", "surname": "ajakh", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0002-1081-8426", "works": false, "name": "a."}
|
||||
{"otherNames": [], "inception": "2017-01-10T12:35:05.016Z", "surname": "antol\u00ednez", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0002-5451-3421", "works": false, "name": "a. (ana)"}
|
||||
{"otherNames": [], "inception": "2018-08-20T05:00:15.964Z", "surname": "mahmudi", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-3187-941X", "works": false, "name": "a. aviv"}
|
||||
{"otherNames": [], "inception": "2017-05-13T01:03:58.949Z", "surname": "akanmu", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0001-6223-5428", "works": false, "name": "a. c."}
|
||||
{"otherNames": [], "inception": "2018-01-20T02:58:05.199Z", "surname": "inci", "mode": "Direct", "creditname": "void", "orcid": "0000-0002-0427-9745", "works": true, "name": "a. can"}
|
||||
{"otherNames": ["a. kim ryan"], "inception": "2014-10-24T23:06:43.544Z", "surname": "hayes", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0002-2055-8269", "works": true, "name": "a. kim"}
|
||||
{"otherNames": [], "inception": "2017-08-10T13:38:29.172Z", "surname": "bahadir", "mode": "Direct", "creditname": "void", "orcid": "0000-0002-4045-0001", "works": false, "name": "a. tugba"}
|
||||
{"otherNames": [], "inception": "2018-08-29T07:49:31.093Z", "surname": "rayna", "mode": "Direct", "creditname": "void", "orcid": "0000-0002-7916-2031", "works": false, "name": "a.brite"}
|
||||
{"otherNames": [], "inception": "2014-07-12T08:02:39.568Z", "surname": "kalyani", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2649-7126", "works": false, "name": "a.grace"}
|
||||
{"otherNames": [], "inception": "2018-07-21T12:00:22.042Z", "surname": "ahmed", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-0777-5848", "works": false, "name": "a.i. mahbub uddin"}
|
||||
{"otherNames": [], "inception": "2018-04-11T13:58:53.355Z", "surname": "a.kathirvel murugan", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2298-6301", "works": false, "name": "a.kathirvel murugan"}
|
||||
{"otherNames": [], "inception": "2017-08-31T11:35:48.559Z", "surname": "dar", "mode": "Direct", "creditname": "void", "orcid": "0000-0001-8781-6309", "works": false, "name": "a.rashid"}
|
||||
{"otherNames": [], "inception": "2014-08-26T00:25:30.968Z", "surname": "sayem", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2461-4667", "works": false, "name": "a.s.m."}
|
||||
{"otherNames": [], "inception": "2019-10-03T01:27:08.212Z", "surname": "conte", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2862-6139", "works": false, "name": "aaron"}
|
||||
{"otherNames": [], "inception": "2020-03-16T09:37:10.610Z", "surname": "rashmi", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-4754-5465", "works": false, "name": "aarthi rashmi b"}
|
||||
{"otherNames": [], "inception": "2017-02-28T19:01:59.146Z", "surname": "bhaskar", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0002-5794-1165", "works": false, "name": "aastha"}
|
||||
{"otherNames": [], "inception": "2020-04-07T18:10:50.922Z", "surname": "belhabib", "mode": "Direct", "creditname": "void", "orcid": "0000-0001-6086-0588", "works": false, "name": "abdelfettah"}
|
||||
{"otherNames": [], "inception": "2019-01-13T21:50:51.923Z", "surname": "laamani", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2055-2593", "works": false, "name": "abdellatif"}
|
||||
{"otherNames": ["fákē", "miñhō"], "inception": "2019-01-13T21:50:51.923Z", "surname": "laamani", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2055-2593", "works": false, "name": "abdellatif"}
|
|
@ -0,0 +1,4 @@
|
|||
{"aname":"Miriam", "asurname":"Baglioni", "afullname":"Miriam Baglioni","oname": "Alessia","osurname": "Bardi","ocreditName": "", "oOtherNames": [],"orcid": "orcid_alessia","id": "50|fake1"}
|
||||
{"aname":"Alessia", "asurname":"Bardi", "afullname":"Alessia Bardi","oname": null,"osurname": null,"ocreditName": null, "oOtherNames": null,"orcid": null,"id": "50|fake1"}
|
||||
{"aname":"Claudio", "asurname":"Atzori", "afullname":"Claudio Atzori","oname": null,"osurname": null,"ocreditName": null, "oOtherNames": null,"orcid": null,"id": "50|fake1"}
|
||||
{"aname":"Alessia", "asurname":"Bardi", "afullname":"Alessia Bardi","oname": "Miriam","osurname": "Baglioni","ocreditName": "", "oOtherNames": [],"orcid": "orcid_miriam","id": "50|fake1"}
|
File diff suppressed because it is too large
File diff suppressed because it is too large
|
@ -26,6 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
|
|||
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
|
@ -144,7 +145,7 @@ public class ConversionUtils {
|
|||
.filter(pid -> pid != null)
|
||||
.filter(pid -> pid.getQualifier() != null)
|
||||
.filter(pid -> pid.getQualifier().getClassid() != null)
|
||||
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
|
||||
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID))
|
||||
.map(pid -> pid.getValue())
|
||||
.map(pid -> cleanOrcid(pid))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
|
|
|
@ -0,0 +1,91 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>dhp-contextpropagation</artifactId>
|
||||
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>4.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-test-compile</id>
|
||||
<phase>process-test-resources</phase>
|
||||
<goals>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpmime</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.elasticsearch</groupId>
|
||||
<artifactId>elasticsearch-hadoop</artifactId>
|
||||
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-graph-provision-scholexplorer</artifactId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</project>
|
|
@ -0,0 +1,77 @@
|
|||
package eu.dnetlib.dhp.contextpropagation;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
|
||||
public class Costants implements Serializable {
|
||||
|
||||
private static Map<String, PropagationUse> publicationDatasetSemantics = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
publicationDatasetSemantics.put("issupplementedby", PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
|
||||
publicationDatasetSemantics.put("cites", PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
|
||||
publicationDatasetSemantics.put("describes", PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
|
||||
publicationDatasetSemantics.put("references", PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
|
||||
publicationDatasetSemantics.put("documents", PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static Map<String, PropagationUse> datasetDatasetSemantics = Maps.newHashMap();
|
||||
|
||||
static{
|
||||
datasetDatasetSemantics.put("isdescribedby",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("iscitedby",PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("cites",PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("issupplementedby",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("issupplementto",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("iscontinuedby",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("continues",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("hasversion",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("isversionof",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("isnewversionof",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("ispreviousversionof",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("ispartof",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("haspart",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("references",PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("isreferencedby",PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("documents",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("isdocumentedby",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("isvariantformof",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("isoriginalformof",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("isidenticalto",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("obsoletes",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
|
||||
datasetDatasetSemantics.put("isobsoletedby",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
|
||||
|
||||
}
|
||||
|
||||
public static Map<String, PropagationUse> getPublicationDatasetSemantics() {
|
||||
return publicationDatasetSemantics;
|
||||
}
|
||||
|
||||
|
||||
public static Map<String, PropagationUse> getDatasetDatasetSemantics() {
|
||||
return datasetDatasetSemantics;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static boolean containedInPubSem(String sem){
|
||||
return publicationDatasetSemantics.containsKey(sem);
|
||||
}
|
||||
|
||||
public static boolean containedInDatsSem(String sem){
|
||||
return datasetDatasetSemantics.containsKey(sem);
|
||||
}
|
||||
|
||||
public static PropagationUse getPublicationValue(String sem){
|
||||
return publicationDatasetSemantics.get(sem);
|
||||
}
|
||||
|
||||
public static PropagationUse getDatasetValue(String sem){
|
||||
return datasetDatasetSemantics.get(sem);
|
||||
}
|
||||
}
|
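A small usage sketch for the lookup helpers above (illustrative only; the relation name is one of the keys defined in the maps of this class):

package eu.dnetlib.dhp.contextpropagation;

public class CostantsLookupExample {
    public static void main(String[] args) {
        String semantics = "issupplementedby";
        // publication -> dataset relations are checked against publicationDatasetSemantics
        if (Costants.containedInPubSem(semantics)) {
            PropagationUse use = Costants.getPublicationValue(semantics);
            System.out.println(use.getUse() + " / " + use.getWeight()); // latent / 1.0
        }
    }
}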
|
@ -0,0 +1,19 @@
|
|||
package eu.dnetlib.dhp.contextpropagation;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
public class DatasetPropagationStructure implements Serializable {
|
||||
|
||||
private Map<String, PropagationUse> propagation = new java.util.HashMap<>(); // initialised so add() works on a fresh instance
|
||||
|
||||
public Map<String, PropagationUse> getPropagation() {
|
||||
return propagation;
|
||||
}
|
||||
|
||||
public void add(String key, PropagationUse value){
|
||||
propagation.put(key, value);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
package eu.dnetlib.dhp.contextpropagation;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class Node implements Serializable {
|
||||
private String id;
|
||||
private List<String> publisher;
|
||||
|
||||
public List<String> getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(List<String> publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public static Node newInstance(String id, List<String> publisher){
|
||||
Node n = new Node();
|
||||
n.id = id;
|
||||
n.publisher = publisher;
|
||||
return n;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package eu.dnetlib.dhp.contextpropagation
|
||||
|
||||
import org.apache.spark.sql.{Encoder, Encoders}
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
|
||||
object PropagationAggregator {
|
||||
|
||||
def getDatasetAggregator(): Aggregator[(String, PropagationStructure), PropagationStructure, PropagationStructure] = new Aggregator[(String, PropagationStructure), PropagationStructure, PropagationStructure]{
|
||||
|
||||
override def zero: PropagationStructure = new PropagationStructure()
|
||||
|
||||
override def reduce(b: PropagationStructure, a: (String, PropagationStructure)): PropagationStructure = {
|
||||
b.mergeFrom(a._2)
|
||||
|
||||
|
||||
}
|
||||
|
||||
override def merge(wx: PropagationStructure, wy: PropagationStructure): PropagationStructure = {
|
||||
|
||||
wx.mergeFrom(wy)
|
||||
|
||||
}
|
||||
override def finish(reduction: PropagationStructure): PropagationStructure = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[PropagationStructure] =
|
||||
Encoders.kryo(classOf[PropagationStructure])
|
||||
|
||||
override def outputEncoder: Encoder[PropagationStructure] =
|
||||
Encoders.kryo(classOf[PropagationStructure])
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
package eu.dnetlib.dhp.contextpropagation;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class PropagationStructure implements Serializable {
|
||||
private Map<String, List<PropagationUse>> propagation = new java.util.HashMap<>(); // initialised so add() and mergeFrom() work on a fresh instance
|
||||
|
||||
public Map<String, List<PropagationUse>> getPropagation() {
|
||||
return propagation;
|
||||
}
|
||||
|
||||
public void add(String key, List<PropagationUse> value){
|
||||
propagation.put(key, value);
|
||||
}
|
||||
|
||||
public void setPropagation(Map<String, List<PropagationUse>> propagation) {
|
||||
this.propagation = propagation;
|
||||
}
|
||||
|
||||
private void mergeList(PropagationUse use, List<PropagationUse> acc){
|
||||
for(PropagationUse pu: acc){
|
||||
if (use.getUse().equals(pu.getUse())){
|
||||
pu.getPath().addAll(use.getPath());
|
||||
				// weights are decimal strings (e.g. "1.0"), so compare them as doubles
				if (Double.parseDouble(pu.getWeight()) < Double.parseDouble(use.getWeight())) {
					pu.setWeight(use.getWeight());
				}
				// the use already exists in the accumulator: do not add it again
				return;
|
||||
}
|
||||
}
|
||||
acc.add(use);
|
||||
}
|
||||
|
||||
public PropagationStructure mergeFrom(PropagationStructure ps){
|
||||
for(String key : ps.propagation.keySet()){
|
||||
if (propagation.containsKey(key)){
|
||||
ps.propagation.get(key).forEach( use -> mergeList(use, propagation.get(key)));
|
||||
}else{
|
||||
propagation.put(key, ps.propagation.get(key).stream().map(pu -> PropagationUse.copyInstance(pu)).collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
}
|
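To make the merge semantics concrete, a hedged example built only on the classes in this changeset (it assumes the propagation map is initialised as above; identifiers and weights are made-up values):

package eu.dnetlib.dhp.contextpropagation;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;

public class PropagationStructureMergeExample {
    public static void main(String[] args) {
        // two structures propagating to the same publication id with the same use
        PropagationStructure a = new PropagationStructure();
        a.add("50|pub_1", new ArrayList<>(Arrays.asList(PropagationUse.newInstance("reuse", "0.7", new HashSet<>()))));

        PropagationStructure b = new PropagationStructure();
        b.add("50|pub_1", new ArrayList<>(Arrays.asList(PropagationUse.newInstance("reuse", "0.9", new HashSet<>()))));

        // the "reuse" entry takes the larger weight after merging
        a.mergeFrom(b);
        System.out.println(a.getPropagation().get("50|pub_1").get(0).getWeight()); // 0.9
    }
}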
|
@ -0,0 +1,54 @@
|
|||
package eu.dnetlib.dhp.contextpropagation;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
public class PropagationUse implements Serializable {
|
||||
private String use;
|
||||
private String weight;
|
||||
private Set<String> path;
|
||||
|
||||
public String getUse() {
|
||||
return use;
|
||||
}
|
||||
|
||||
public void setUse(String use) {
|
||||
this.use = use;
|
||||
}
|
||||
|
||||
public String getWeight() {
|
||||
return weight;
|
||||
}
|
||||
|
||||
public void setWeight(String weight) {
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
public Set<String> getPath() {
|
||||
return path;
|
||||
}
|
||||
|
||||
public void setPath(Set<String> path) {
|
||||
this.path = path;
|
||||
}
|
||||
|
||||
public static PropagationUse newInstance(String use, String weight, Set<String> path){
|
||||
PropagationUse pu = new PropagationUse();
|
||||
pu.use = use;
|
||||
pu.weight = weight;
|
||||
pu.path = path;
|
||||
return pu;
|
||||
}
|
||||
|
||||
public static PropagationUse copyInstance(PropagationUse use){
|
||||
PropagationUse pu = new PropagationUse();
|
||||
pu.path = use.path;
|
||||
pu.weight = use.weight;
|
||||
pu.use = use.use;
|
||||
return pu;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
package eu.dnetlib.dhp.contextpropagation;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class RelationPropagation implements Serializable {
|
||||
private Node source;
|
||||
private Node target;
|
||||
private String semantics;
|
||||
|
||||
public Node getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public void setSource(Node source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public Node getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
public void setTarget(Node target) {
|
||||
this.target = target;
|
||||
}
|
||||
|
||||
public String getSemantics() {
|
||||
return semantics;
|
||||
}
|
||||
|
||||
public void setSemantics(String semantics) {
|
||||
this.semantics = semantics;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,127 @@
|
|||
package eu.dnetlib.dhp.contextpropagation
|
||||
|
||||
import java.util
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
|
||||
import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixEntityId}
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object SparkContextPropagation {
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertDatasetToJson.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/dataset2Json.json")))
|
||||
parser.parseArgument(args)
|
||||
val conf = new SparkConf
|
||||
val spark = SparkSession.builder.config(conf).appName(SparkConvertDatasetToJson.getClass.getSimpleName).master(parser.get("master")).getOrCreate
|
||||
|
||||
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
|
||||
implicit val mapEncoderPub: Encoder[PropagationStructure] = Encoders.kryo[PropagationStructure]
|
||||
implicit val mapEncoderDats: Encoder[DatasetPropagationStructure] = Encoders.kryo[DatasetPropagationStructure]
|
||||
implicit val tupleForPropagation: Encoder[(String, PropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
||||
implicit val tupleForPropagationDars: Encoder[(String, DatasetPropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderDats)
|
||||
implicit val stringEncoder: Encoder[String] = Encoders.STRING
|
||||
|
||||
val workingPath = parser.get("workingPath")
|
||||
|
||||
def getPublisherList(item: List[ScholixEntityId]) : util.List[String] =
|
||||
{
|
||||
|
||||
item.map(entry => entry.getName).asJava
|
||||
}
|
||||
|
||||
def propagateDataset (item: ((String, PropagationStructure), (String, DatasetPropagationStructure))) : (String, PropagationStructure) = {
|
||||
val propagation = item._1._2.getPropagation.asScala
|
||||
val dsprob : DatasetPropagationStructure= item._2._2
|
||||
val source = dsprob.getPropagation.keySet().iterator().next()
|
||||
val dic = new scala.collection.mutable.HashMap[String, util.List[PropagationUse]]
|
||||
|
||||
propagation.keysIterator.foreach(key => {
|
||||
val entries = propagation.get(key).get.asScala
|
||||
|
||||
entries.foreach(entry => {
|
||||
if((entry.getUse == dsprob.getPropagation.get(source).getUse || dsprob.getPropagation.get(source).getUse == "proxy")
|
||||
&& !entry.getPath.contains(source)) {
|
||||
          // weights are decimal strings (e.g. "1.0"), so multiply them as doubles
          val new_p = entry.getWeight.toDouble * dsprob.getPropagation.get(source).getWeight.toDouble
          if (new_p > 0.3) {
            val newentry: PropagationUse = PropagationUse.copyInstance(entry)
            newentry.setWeight(String.valueOf(new_p))
            newentry.getPath.add(source)

            // create the list for this key on first use, then append the propagated entry
            dic.getOrElseUpdate(key, new util.ArrayList[PropagationUse]()).add(newentry)
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
var ps: PropagationStructure = new PropagationStructure
|
||||
ps.setPropagation(dic.asJava)
|
||||
(source, ps)
|
||||
|
||||
}
|
||||
|
||||
spark.read.load(s"$workingPath/summary").as[ScholixSummary]
|
||||
.map(s => new ObjectMapper().writeValueAsString(s))(Encoders.STRING)
|
||||
.rdd.repartition(500).saveAsTextFile(s"$workingPath/summary_json", classOf[GzipCodec])
|
||||
|
||||
val allowedRelations : Dataset[RelationPropagation] = spark.read.load(s"$workingPath/scholix").as[Scholix]
|
||||
.filter(s => !s.getSource().getDnetIdentifier().substring(0,2).equals("70") )
|
||||
.filter(s => !s.getTarget().getDnetIdentifier().substring(0,2).equals("70"))
|
||||
.map(s => {
|
||||
val rp = new RelationPropagation
|
||||
rp.setSource(Node.newInstance(s.getSource.getDnetIdentifier, getPublisherList(s.getSource.getPublisher.asScala.toList)))
|
||||
rp.setTarget(Node.newInstance(s.getTarget.getDnetIdentifier, getPublisherList(s.getTarget.getPublisher.asScala.toList)))
|
||||
rp.setSemantics(s.getRelationship.getName)
|
||||
rp
|
||||
})
|
||||
|
||||
val pubs_rel : Dataset[RelationPropagation] = allowedRelations
|
||||
.filter(r => r.getSource.getId.substring(0,2) == "50"
|
||||
&& r.getTarget.getId.substring(0,2) == "60"
|
||||
&& Costants.containedInPubSem(r.getSemantics))
|
||||
val dats_rel : Dataset[RelationPropagation] = allowedRelations
|
||||
.filter(r => r.getSource.getId.substring(0,2) == "60"
|
||||
&& r.getTarget.getId.substring(0,2) == "60"
|
||||
&& Costants.containedInDatsSem(r.getSemantics)
|
||||
&& r.getSource.getId != r.getTarget.getId)
|
||||
|
||||
val publication_dataset : Dataset[(String, PropagationStructure)] = pubs_rel.map(r => {
|
||||
val ps = new PropagationStructure
|
||||
|
||||
val pv : List[PropagationUse] = List(PropagationUse.copyInstance(Costants.getPublicationValue(r.getSemantics)))
|
||||
ps.add(r.getSource.getId, pv.asJava)
|
||||
(r.getTarget.getId, ps)
|
||||
|
||||
})
|
||||
|
||||
val dataset_dataset : Dataset[(String, DatasetPropagationStructure)] = dats_rel.map(r => {
|
||||
val ps = new DatasetPropagationStructure
|
||||
|
||||
ps.add(r.getTarget.getId, PropagationUse.copyInstance(Costants.getDatasetValue(r.getSemantics)))
|
||||
(r.getSource.getId, ps)
|
||||
|
||||
})
|
||||
|
||||
|
||||
val pl1 : Dataset[(String, PropagationStructure)] = publication_dataset.groupByKey(_._1)
|
||||
.agg(PropagationAggregator.getDatasetAggregator().toColumn)
|
||||
|
||||
|
||||
val pl2_step1 : Dataset [(String, PropagationStructure)] = pl1.joinWith(dataset_dataset, pl1("_1").equalTo(dataset_dataset("_1")))
|
||||
.map(propagateDataset)
|
||||
|
||||
val pl2 : Dataset [(String, PropagationStructure)] = pl1.union(pl2_step1).groupByKey(_._1)
|
||||
.agg(PropagationAggregator.getDatasetAggregator().toColumn)
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
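As a worked example of the thresholding in propagateDataset above: the propagated weight is the product of the publication-to-dataset weight and the dataset-to-dataset weight, and the propagation is kept only when the product exceeds 0.3 (the weights below are illustrative values, not taken from this changeset):

public class PropagationThresholdExample {
    public static void main(String[] args) {
        double publicationToDataset = 1.0; // e.g. the weight of a "reuse" relation
        double datasetToDataset = 0.5;     // e.g. a weaker dataset-to-dataset link
        double propagated = publicationToDataset * datasetToDataset;
        System.out.println(propagated > 0.3); // true: 0.5 is kept, while 0.5 * 0.5 = 0.25 would be dropped
    }
}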
|
@ -18,7 +18,7 @@ import eu.dnetlib.dhp.schema.oaf.Field;
|
|||
|
||||
public class DatePicker {
|
||||
|
||||
private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
|
||||
private static final String DATE_PATTERN = "^(\\d{4})-(\\d{2})-(\\d{2})";
|
||||
private static final String DATE_DEFAULT_SUFFIX = "01-01";
|
||||
private static final int YEAR_LB = 1300;
|
||||
private static final int YEAR_UB = Year.now().getValue() + 5;
|
||||
|
@ -28,6 +28,7 @@ public class DatePicker {
|
|||
final Map<String, Integer> frequencies = dateofacceptance
|
||||
.parallelStream()
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(d -> substringBefore(d, "T"))
|
||||
.collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum));
|
||||
|
||||
if (frequencies.isEmpty()) {
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.clearspring.analytics.util.Lists;
|
||||
|
||||
public class DatePickerTest {
|
||||
|
||||
Collection<String> dates = Lists.newArrayList();
|
||||
|
||||
@Test
|
||||
public void testPickISO() {
|
||||
dates.add("2016-01-01T12:00:00Z");
|
||||
dates.add("2016-06-16T12:00:00Z");
|
||||
dates.add("2020-01-01T12:00:00Z");
|
||||
dates.add("2020-10-01T12:00:00Z");
|
||||
assertEquals("2020-10-01", DatePicker.pick(dates).getValue());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPickSimple() {
|
||||
dates.add("2016-01-01");
|
||||
dates.add("2016-06-16");
|
||||
dates.add("2020-01-01");
|
||||
dates.add("2020-10-01");
|
||||
assertEquals("2020-10-01", DatePicker.pick(dates).getValue());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPickFrequent() {
|
||||
dates.add("2016-02-01");
|
||||
dates.add("2016-02-01");
|
||||
dates.add("2016-02-01");
|
||||
dates.add("2020-10-01");
|
||||
assertEquals("2016-02-01", DatePicker.pick(dates).getValue());
|
||||
}
|
||||
|
||||
}
|
|
@ -5,6 +5,7 @@ import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue,
|
|||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
@ -28,7 +29,6 @@ object DoiBoostMappingUtil {
|
|||
//STATIC STRING
|
||||
val MAG = "microsoft"
|
||||
val MAG_NAME = "Microsoft Academic Graph"
|
||||
val ORCID = "orcid"
|
||||
val ORCID_PENDING = "orcid_pending"
|
||||
val CROSSREF = "Crossref"
|
||||
val UNPAYWALL = "UnpayWall"
|
||||
|
@ -37,8 +37,6 @@ object DoiBoostMappingUtil {
|
|||
val doiBoostNSPREFIX = "doiboost____"
|
||||
val OPENAIRE_PREFIX = "openaire____"
|
||||
val SEPARATOR = "::"
|
||||
val DNET_LANGUAGES = "dnet:languages"
|
||||
val PID_TYPES = "dnet:pid_types"
|
||||
|
||||
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
|
||||
|
||||
|
@ -326,8 +324,8 @@ object DoiBoostMappingUtil {
|
|||
def createORIDCollectedFrom(): KeyValue = {
|
||||
|
||||
val cf = new KeyValue
|
||||
cf.setValue(ORCID)
|
||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ORCID.toLowerCase))
|
||||
cf.setValue(ModelConstants.ORCID_DS)
|
||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ModelConstants.ORCID))
|
||||
cf
|
||||
|
||||
}
|
||||
|
|
|
@ -87,7 +87,7 @@ case object Crossref2Oaf {
|
|||
|
||||
//MAPPING Crossref DOI into PID
|
||||
val doi: String = (json \ "DOI").extract[String]
|
||||
result.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
|
||||
result.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
|
||||
|
||||
//MAPPING Crossref DOI into OriginalId
|
||||
//and Other Original Identifier of dataset like clinical-trial-number
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
|
||||
package eu.dnetlib.doiboost.crossref;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.net.URI;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class ExtractCrossrefRecords {
|
||||
public static void main(String[] args) throws Exception {
|
||||
String hdfsServerUri;
|
||||
String workingPath;
|
||||
String crossrefFileNameTarGz;
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
ExtractCrossrefRecords.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/crossref_dump_reader.json")));
|
||||
parser.parseArgument(args);
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
workingPath = parser.get("workingPath");
|
||||
crossrefFileNameTarGz = parser.get("crossrefFileNameTarGz");
|
||||
|
||||
Path hdfsreadpath = new Path(hdfsServerUri.concat(workingPath).concat(crossrefFileNameTarGz));
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
FileSystem fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
|
||||
FSDataInputStream crossrefFileStream = fs.open(hdfsreadpath);
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(
|
||||
new GzipCompressorInputStream(crossrefFileStream))) {
|
||||
TarArchiveEntry entry = null;
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
if (entry.isDirectory()) {
|
||||
} else {
|
||||
try (
|
||||
FSDataOutputStream out = fs
|
||||
.create(new Path(workingPath.concat("filess/").concat(entry.getName()).concat(".gz")));
|
||||
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
|
||||
|
||||
IOUtils.copy(tais, gzipOs);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
Log.info("Crossref dump reading completed");
|
||||
|
||||
}
|
||||
}
|
|
@ -33,9 +33,9 @@ object SparkMapDumpIntoOAF {
|
|||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val mapEncoderRelatons: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset]
|
||||
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
|
||||
|
||||
val targetPath = parser.get("targetPath")
|
||||
import spark.implicits._
|
||||
|
||||
spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
|
||||
.flatMap(k => Crossref2Oaf.convert(k.json))
|
||||
|
|
|
@ -188,7 +188,7 @@ case object ConversionUtil {
|
|||
val authors = inputParams._2
|
||||
|
||||
val pub = new Publication
|
||||
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
|
||||
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
|
||||
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
|
||||
|
||||
//Set identifier as 50|doiboost____::md5(DOI)
|
||||
|
@ -247,7 +247,7 @@ case object ConversionUtil {
|
|||
val description = inputParams._2
|
||||
|
||||
val pub = new Publication
|
||||
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
|
||||
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
|
||||
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
|
||||
|
||||
//Set identifier as 50 | doiboost____::md5(DOI)
|
||||
|
|
|
@ -30,7 +30,6 @@ public class PublicationToOaf implements Serializable {

    static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);

    public static final String ORCID = "ORCID";
    public final static String orcidPREFIX = "orcid_______";
    public static final String OPENAIRE_PREFIX = "openaire____";
    public static final String SEPARATOR = "::";

@ -69,7 +68,9 @@ public class PublicationToOaf implements Serializable {
    private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {

        {
            put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
            put(
                ModelConstants.ORCID,
                new Pair<>(ModelConstants.ORCID_DS, OPENAIRE_PREFIX + SEPARATOR + ModelConstants.ORCID));

        }
    };

@ -102,8 +103,6 @@ public class PublicationToOaf implements Serializable {
        }
    }

    public static final String PID_TYPES = "dnet:pid_types";

    public Oaf generatePublicationActionsFromJson(final String json) {
        try {
            if (parsedPublications != null) {

@ -138,8 +137,8 @@ public class PublicationToOaf implements Serializable {
                    mapQualifier(
                        "sysimport:actionset:orcidworks-no-doi",
                        "sysimport:actionset:orcidworks-no-doi",
                        "dnet:provenanceActions",
                        "dnet:provenanceActions"));
                        ModelConstants.DNET_PROVENANCE_ACTIONS,
                        ModelConstants.DNET_PROVENANCE_ACTIONS));
            publication.setDataInfo(dataInfo);

            publication.setLastupdatetimestamp(new Date().getTime());

@ -159,7 +158,9 @@ public class PublicationToOaf implements Serializable {
                    publication
                        .getExternalReference()
                        .add(
                            convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
                            convertExtRef(
                                extId, classid, classname, ModelConstants.DNET_PID_TYPES,
                                ModelConstants.DNET_PID_TYPES));
                }
            });

@ -505,24 +506,21 @@ public class PublicationToOaf implements Serializable {

    private KeyValue createCollectedFrom() {
        KeyValue cf = new KeyValue();
        cf.setValue(ORCID);
        cf.setValue(ModelConstants.ORCID_DS);
        cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
        return cf;
    }

    private KeyValue createHostedBy() {
        KeyValue hb = new KeyValue();
        hb.setValue("Unknown Repository");
        hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
        return hb;
        return ModelConstants.UNKNOWN_REPOSITORY;
    }

    private StructuredProperty mapAuthorId(String orcidId) {
        final StructuredProperty sp = new StructuredProperty();
        sp.setValue(orcidId);
        final Qualifier q = new Qualifier();
        q.setClassid(ORCID.toLowerCase());
        q.setClassname(ORCID.toLowerCase());
        q.setClassid(ModelConstants.ORCID);
        q.setClassname(ModelConstants.ORCID_CLASSNAME);
        q.setSchemeid(ModelConstants.DNET_PID_TYPES);
        q.setSchemename(ModelConstants.DNET_PID_TYPES);
        sp.setQualifier(q);

@ -535,8 +533,8 @@ public class PublicationToOaf implements Serializable {
                mapQualifier(
                    "sysimport:crosswalk:entityregistry",
                    "Harvested",
                    "dnet:provenanceActions",
                    "dnet:provenanceActions"));
                    ModelConstants.DNET_PROVENANCE_ACTIONS,
                    ModelConstants.DNET_PROVENANCE_ACTIONS));
        sp.setDataInfo(dataInfo);
        return sp;
    }
|
||||
|
|
|
@ -1,5 +1,6 @@
package eu.dnetlib.doiboost.uw

import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Instance, Publication}
import org.json4s
import org.json4s.DefaultFormats

@ -32,7 +33,7 @@ object UnpayWallToOAF {
    val is_oa = (json \ "is_oa").extract[Boolean]

    val oaLocation: OALocation = (json \ "best_oa_location").extractOrElse[OALocation](null)
    pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
    pub.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
    pub.setId(generateIdentifier(pub, doi.toLowerCase))

    pub.setCollectedfrom(List(createUnpayWallCollectedFrom()).asJava)
|
||||
|
|
|
@ -0,0 +1,7 @@
[
  {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
  {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
  {"paramName":"f", "paramLongName":"crossrefFileNameTarGz", "paramDescription": "the name of the crossref dump tar.gz file", "paramRequired": true},
  {"paramName":"issm", "paramLongName":"isSparkSessionManaged", "paramDescription": "true if the spark session is managed, false otherwise", "paramRequired": false}
]
|
|
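A minimal sketch, assuming the ArgumentApplicationParser pattern used by the other jobs in this changeset, of how a parameter spec like the one above is typically consumed; the object name and the resource path are hypothetical, only the parameter long names come from the JSON file.

// Illustrative only: mirrors the parser pattern used elsewhere in this diff.
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils

object ExtractCrossrefRecordsSketch {
  def main(args: Array[String]): Unit = {
    // hypothetical resource name for the spec shown above
    val parser = new ArgumentApplicationParser(
      IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref_dump_reader.json")))
    parser.parseArgument(args)
    val hdfsServerUri = parser.get("hdfsServerUri")
    val workingPath = parser.get("workingPath")
    val crossrefFileNameTarGz = parser.get("crossrefFileNameTarGz")
    println(s"reading $crossrefFileNameTarGz from $hdfsServerUri$workingPath")
  }
}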
@ -0,0 +1,68 @@
<workflow-app name="read Crossref dump from HDFS" xmlns="uri:oozie:workflow:0.5">
    <parameters>
<!--        <property>-->
<!--            <name>workingPath</name>-->
<!--            <description>the working dir base path</description>-->
<!--        </property>-->
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
    </parameters>

    <start to="ReadCrossRefDump"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="ReadCrossRefDump">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <main-class>eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords</main-class>
            <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
            <arg>--workingPath</arg><arg>/data/doiboost/crossref/</arg>
            <arg>--crossrefFileNameTarGz</arg><arg>crossref.tar.gz</arg>
        </java>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <action name="SparkReadCrossRefDump">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>SparkReadCrossRefDump</name>
            <class>eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=20
                --executor-memory=6G
                --driver-memory=7G
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            </spark-opts>
            <arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
            <arg>--workingPath</arg><arg>/data/doiboost/crossref/</arg>
            <arg>--crossrefFileNameTarGz</arg><arg>crossref.tar.gz</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>
|
|
@ -1,372 +0,0 @@
|
|||
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorIntersectionMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
|
||||
|
||||
<!-- Itersection Parameters -->
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>hostedByMapPath</name>
|
||||
<description>the hostedByMap Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the Path of the sequence file action set</description>
|
||||
</property>
|
||||
|
||||
|
||||
<!-- Crossref Parameters -->
|
||||
<property>
|
||||
<name>inputPathCrossref</name>
|
||||
<description>the Crossref input path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>crossrefTimestamp</name>
|
||||
<description>Timestamp for the Crossref incremental Harvesting</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esServer</name>
|
||||
<description>elasticsearch server url for the Crossref Harvesting</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esIndex</name>
|
||||
<description>elasticsearch index name for the Crossref Harvesting</description>
|
||||
</property>
|
||||
|
||||
<!-- MAG Parameters -->
|
||||
<property>
|
||||
<name>MAGDumpPath</name>
|
||||
<description>the MAG dump working path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>inputPathMAG</name>
|
||||
<description>the MAG working path</description>
|
||||
</property>
|
||||
|
||||
|
||||
<!-- UnpayWall Parameters -->
|
||||
<property>
|
||||
<name>inputPathUnpayWall</name>
|
||||
<description>the UnpayWall working path</description>
|
||||
</property>
|
||||
|
||||
<!-- ORCID Parameters -->
|
||||
<property>
|
||||
<name>inputPathOrcid</name>
|
||||
<description>the ORCID input path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>workingPathOrcid</name>
|
||||
<description>the ORCID working path</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="resume_from"/>
|
||||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="ConvertCrossrefToOAF">${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'}</case>
|
||||
<case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
|
||||
<case to="ProcessMAG">${wf:conf('resumeFrom') eq 'PreprocessMag'}</case>
|
||||
<case to="ProcessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
|
||||
<case to="ProcessORCID">${wf:conf('resumeFrom') eq 'PreprocessORCID'}</case>
|
||||
<case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
|
||||
<case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
|
||||
<default to="ImportCrossRef"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ImportCrossRef">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
|
||||
<arg>--targetPath</arg><arg>${inputPathCrossref}/index_update</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--esServer</arg><arg>${esServer}</arg>
|
||||
<arg>--esIndex</arg><arg>${esIndex}</arg>
|
||||
<arg>--timestamp</arg><arg>${crossrefTimestamp}</arg>
|
||||
</java>
|
||||
<ok to="GenerateCrossrefDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<!-- CROSSREF SECTION -->
|
||||
|
||||
<action name="GenerateCrossrefDataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateCrossrefDataset</name>
|
||||
<class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${inputPathCrossref}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="RenameDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RenameDataset">
|
||||
<fs>
|
||||
<delete path="${inputPathCrossref}/crossref_ds"/>
|
||||
<move source="${inputPathCrossref}/crossref_ds_updated"
|
||||
target="${inputPathCrossref}/crossref_ds"/>
|
||||
</fs>
|
||||
<ok to="ResetMagWorkingPath"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<!-- MAG SECTION -->
|
||||
<action name="ResetMagWorkingPath">
|
||||
<fs>
|
||||
<delete path="${inputPathMAG}/dataset"/>
|
||||
<delete path="${inputPathMAG}/process"/>
|
||||
</fs>
|
||||
<ok to="ConvertMagToDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ConvertMagToDataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Mag to Dataset</name>
|
||||
<class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${MAGDumpPath}</arg>
|
||||
<arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="ConvertCrossrefToOAF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ConvertCrossrefToOAF">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ConvertCrossrefToOAF</name>
|
||||
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="ProcessMAG"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ProcessMAG">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Mag to OAF Dataset</name>
|
||||
<class>eu.dnetlib.doiboost.mag.SparkProcessMAG</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorIntersectionMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg>
|
||||
<arg>--workingPath</arg><arg>${inputPathMAG}/process</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="ProcessUW"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- UnpayWall SECTION -->
|
||||
|
||||
<action name="ProcessUW">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert UnpayWall to Dataset</name>
|
||||
<class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${inputPathUnpayWall}/uw_extracted</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}/uwPublication</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="ProcessORCID"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- ORCID SECTION -->
|
||||
<action name="ProcessORCID">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert ORCID to Dataset</name>
|
||||
<class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPathOrcid}</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}/orcidPublication</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="CreateDOIBoost"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- INTERSECTION SECTION-->
|
||||
<action name="CreateDOIBoost">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create DOIBoost Infospace</name>
|
||||
<class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorIntersectionMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
|
||||
<arg>--affiliationPath</arg><arg>${inputPathMAG}/dataset/Affiliations</arg>
|
||||
<arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/dataset/PaperAuthorAffiliations</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="GenerateActionSet"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="GenerateActionSet">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Generate DOIBoost ActionSet</name>
|
||||
<class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>
|
||||
<arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
|
||||
<arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
|
||||
<arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
|
||||
<arg>--dbOrganizationPath</arg><arg>${workingPath}/doiBoostOrganization</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
|
||||
<arg>--sFilePath</arg><arg>${outputPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
File diff suppressed because one or more lines are too long
|
@ -117,6 +117,12 @@
            <artifactId>json4s-jackson_2.11</artifactId>
        </dependency>

        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-graph-provision-scholexplorer</artifactId>
            <version>1.2.4-SNAPSHOT</version>
        </dependency>

    </dependencies>
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,78 @@

package eu.dnetlib.dhp.contextpropagation;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Map;

import com.google.common.collect.Maps;

import eu.dnetlib.dhp.contextpropagation.model.PropagationUse;

public class Costants implements Serializable {

    private static Map<String, PropagationUse> publicationDatasetSemantics = Maps.newHashMap();

    static {
        publicationDatasetSemantics
            .put("issupplementedby", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
        publicationDatasetSemantics.put("cites", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
        publicationDatasetSemantics.put("describes", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
        publicationDatasetSemantics.put("references", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
        publicationDatasetSemantics.put("documents", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
    }

    private static Map<String, PropagationUse> datasetDatasetSemantics = Maps.newHashMap();

    static {
        datasetDatasetSemantics.put("isdescribedby", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("iscitedby", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("cites", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("issupplementedby", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("issupplementto", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("iscontinuedby", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("continues", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("hasversion", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("isversionof", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("isnewversionof", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
        datasetDatasetSemantics
            .put("ispreviousversionof", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("ispartof", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("haspart", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("references", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("isreferencedby", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("documents", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("isdocumentedby", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("isvariantformof", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("isoriginalformof", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("isidenticalto", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("obsoletes", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
        datasetDatasetSemantics.put("isobsoletedby", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
    }

    public static Map<String, PropagationUse> getPublicationDatasetSemantics() {
        return publicationDatasetSemantics;
    }

    public static Map<String, PropagationUse> getDatasetDatasetSemantics() {
        return datasetDatasetSemantics;
    }

    public static boolean containedInPubSem(String sem) {
        return publicationDatasetSemantics.containsKey(sem);
    }

    public static boolean containedInDatsSem(String sem) {
        return datasetDatasetSemantics.containsKey(sem);
    }

    public static PropagationUse getPublicationValue(String sem) {
        return publicationDatasetSemantics.get(sem);
    }

    public static PropagationUse getDatasetValue(String sem) {
        return datasetDatasetSemantics.get(sem);
    }
}
|
|
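A small sketch, assuming it sits in the eu.dnetlib.dhp.contextpropagation package, of how the semantic tables above are consulted when classifying a relation; the semantics string is a hypothetical input, while the Costants and PropagationUse calls are the ones defined in this changeset.

// Sketch only: membership test first, then lookup of the PropagationUse attached to the semantics.
object CostantsLookupSketch {
  def describe(semantics: String): Unit = {
    if (Costants.containedInPubSem(semantics)) {
      val use = Costants.getPublicationValue(semantics)
      println(s"publication -> dataset '$semantics': use=${use.getUse}, weight=${use.getWeight}")
    }
    if (Costants.containedInDatsSem(semantics)) {
      val use = Costants.getDatasetValue(semantics)
      println(s"dataset -> dataset '$semantics': use=${use.getUse}, weight=${use.getWeight}")
    }
  }
}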
@ -0,0 +1,60 @@
package eu.dnetlib.dhp.contextpropagation

import eu.dnetlib.dhp.contextpropagation.model.{EnrichedEntries, PropagationStructure}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}

object PropagationAggregator {

  def getDatasetAggregator(): Aggregator[(String, PropagationStructure), PropagationStructure, PropagationStructure] =
    new Aggregator[(String, PropagationStructure), PropagationStructure, PropagationStructure] {

      override def zero: PropagationStructure = new PropagationStructure()

      override def reduce(b: PropagationStructure, a: (String, PropagationStructure)): PropagationStructure = {
        b.mergeFrom(a._2)
      }

      override def merge(wx: PropagationStructure, wy: PropagationStructure): PropagationStructure = {
        wx.mergeFrom(wy)
      }

      override def finish(reduction: PropagationStructure): PropagationStructure = reduction

      override def bufferEncoder: Encoder[PropagationStructure] =
        Encoders.kryo(classOf[PropagationStructure])

      override def outputEncoder: Encoder[PropagationStructure] =
        Encoders.kryo(classOf[PropagationStructure])
    }

  def mergeEnrichedEntries(): Aggregator[(String, EnrichedEntries), EnrichedEntries, EnrichedEntries] =
    new Aggregator[(String, EnrichedEntries), EnrichedEntries, EnrichedEntries] {

      override def zero: EnrichedEntries = new EnrichedEntries()

      override def reduce(b: EnrichedEntries, a: (String, EnrichedEntries)): EnrichedEntries = {
        b.mergeWith(a._2)
      }

      override def merge(wx: EnrichedEntries, wy: EnrichedEntries): EnrichedEntries = {
        wx.mergeWith(wy)
      }

      override def finish(reduction: EnrichedEntries): EnrichedEntries = reduction

      override def bufferEncoder: Encoder[EnrichedEntries] =
        Encoders.kryo(classOf[EnrichedEntries])

      override def outputEncoder: Encoder[EnrichedEntries] =
        Encoders.kryo(classOf[EnrichedEntries])
    }
}
|
|
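A usage sketch for the aggregator above, mirroring the groupByKey/agg pattern used by SparkContextPropagation further down in this changeset; the helper name "collapseByTarget" and the encoder names are hypothetical.

// Sketch only: one merged PropagationStructure per target id.
object PropagationAggregatorUsageSketch {
  import eu.dnetlib.dhp.contextpropagation.model.PropagationStructure
  import org.apache.spark.sql.{Dataset, Encoder, Encoders}

  implicit val psEncoder: Encoder[PropagationStructure] = Encoders.kryo[PropagationStructure]
  implicit val pairEncoder: Encoder[(String, PropagationStructure)] =
    Encoders.tuple(Encoders.STRING, psEncoder)

  def collapseByTarget(pairs: Dataset[(String, PropagationStructure)]): Dataset[(String, PropagationStructure)] =
    pairs
      .groupByKey(_._1)(Encoders.STRING)                          // group by target id
      .agg(PropagationAggregator.getDatasetAggregator().toColumn) // merge via PropagationStructure.mergeFrom
}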
@ -0,0 +1,250 @@
package eu.dnetlib.dhp.contextpropagation

import java.util

import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import eu.dnetlib.dhp.contextpropagation.model.{DatasetPropagationStructure, EnrichedEntries, MapSxOA, Node, PropagationStructure, PropagationUse, RelationPropagation}
import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixEntityId}
import eu.dnetlib.dhp.provision.scholix.summary.{SchemeValue, ScholixSummary}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}

import scala.collection.mutable.ListBuffer
import scala.collection.JavaConverters._

object PropagationUtils {

  implicit val enrichedEntitiesEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
  implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
  implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]

  def getSelectedNodes(path: String, spark: SparkSession): Dataset[String] = {

    implicit val stringEncoder: Encoder[String] = Encoders.STRING

    val allowedRelations = spark.read.load(path).as[RelationPropagation]

    val pubs_rel: Dataset[RelationPropagation] = allowedRelations
      .filter(r => r.getSource.getId.startsWith("50")
        && r.getTarget.getId.startsWith("60")
        && Costants.containedInPubSem(r.getSemantics.toLowerCase()))
    val dats_rel: Dataset[RelationPropagation] = allowedRelations
      .filter(r => r.getSource.getId.startsWith("60")
        && r.getTarget.getId.startsWith("60")
        && Costants.containedInDatsSem(r.getSemantics.toLowerCase())
        && r.getSource.getId != r.getTarget.getId)

    pubs_rel.map(r => r.getSource.getId).union(pubs_rel.map(r => r.getTarget.getId))
      .union(dats_rel.map(r => r.getSource.getId)).union(dats_rel.map(r => r.getTarget.getId)).distinct()
  }

  def getSubjectList(value: util.List[String], scheme: util.List[String]): util.List[SchemeValue] = {
    var subjects = new ListBuffer[SchemeValue]()

    var i = 0
    for (elem <- value.asScala) {
      val sv: SchemeValue = new SchemeValue()
      sv.setScheme(scheme.get(i))
      sv.setValue(elem)
      subjects += sv
      i += 1
    }

    subjects.toList.asJava
  }

  def propagateDataset(item: ((String, PropagationStructure), (String, DatasetPropagationStructure))): List[(String, PropagationStructure)] = {
    val lst = new ListBuffer[(String, PropagationStructure)]()
    lst += item._1
    if (item._2 != null) {

      val propagation = item._1._2.getPropagation.asScala
      val dsprob: DatasetPropagationStructure = item._2._2
      val source = dsprob.getPropagation.keySet().iterator().next()
      val dic = new scala.collection.mutable.HashMap[String, util.List[PropagationUse]]

      propagation.keysIterator.foreach(key => {
        val entries = propagation.get(key).get.asScala

        entries.foreach(entry => {
          if ((entry.getUse == dsprob.getPropagation.get(source).getUse || dsprob.getPropagation.get(source).getUse == "proxy")
            && !entry.getPath.contains(source)) {
            var new_p = entry.getWeight.toDouble * dsprob.getPropagation.get(source).getWeight.toDouble
            if (new_p > 0.3) {
              var newentry: PropagationUse = PropagationUse.copyInstance(entry)
              newentry.setWeight(String.valueOf(new_p))
              if (!newentry.getPath.contains(item._1._1))
                newentry.getPath.add(item._1._1)
              if (!dic.keySet.contains(key)) {
                dic.put(key, new util.ArrayList[PropagationUse]())
              }

              dic(key).add(newentry)
            }
          }
        })
      })
      var ps: PropagationStructure = new PropagationStructure
      ps.setPropagation(dic.asJava)
      lst += ((source, ps))
    }
    lst.toList
  }

  def enrichScholix(summary_path: String, spark: SparkSession): Dataset[EnrichedEntries] = {

    implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]

    spark.read.load(summary_path).as[ScholixSummary]
      .map(ss => {
        val ee: EnrichedEntries = new EnrichedEntries()
        ee.setScholixId(ss.getId)
        ee.setTitle(ss.getTitle)
        if (ss.getDescription != null) {
          ee.setDescription(List(ss.getDescription).asJava)
        } else {
          ee.setDescription(new util.ArrayList[String]())
        }
        if (ss.getSubject != null) {
          ee.setSubject(ss.getSubject)
        } else {
          ee.setSubject(new util.ArrayList[SchemeValue]())
        }
        if (ee.getDescription.size() > 0 && ee.getSubject.size() > 0) {
          ee
        } else {
          null
        }
      })
  }

  def mergeEnrichments(item: ((String, EnrichedEntries), (String, EnrichedEntries))): EnrichedEntries = {
    if (item._1 == null)
      item._2._2
    else if (item._2 == null)
      item._1._2
    else
      item._2._2.mergeWith(item._1._2)
  }
|
||||
|
||||
|
||||
/*#reads the scholexplorer scholix dump. It filters out records with prefix 70 (unknown)
|
||||
input = sc.textFile('/user/dnet.scholexplorer/scholix/provision/scholix_dump/scholix_json').map(json.loads).filter(lambda x: x['source']['dnetIdentifier'][0:2]!='70')
|
||||
sources = input.map(lambda x: x['source']['dnetIdentifier']).distinct()
|
||||
|
||||
#the relations are bidirectional, so the set of nodes is uniquely identified by the sources
|
||||
nodes = input.map(lambda x: x['source']['dnetIdentifier']).distinct().map(lambda x: prefix + x[3:] if not 'dedup' in x else prefix + x[17:]).distinct().map(lambda x : {'scholix' : x})
|
||||
|
||||
#creates a mapping between the original scholexplorer ids and the ids the records will have in OpenAIRE
|
||||
scholexplorerMapOpenaire = sources.map(lambda x: {'scholexplorer': x, 'openaire': prefix + x[3:] if 'dedup' not in x else prefix + x[17:]})
|
||||
scholexplorerMapOpenaire.map(json.dumps).saveAsTextFile(path = '/tmp/miriam/context_propagation/scholexplorerIdsMapOA', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
|
||||
|
||||
#reads the summaries (containing title, subject and description ) for the scholexplorer records
|
||||
summaries = sc.textFile('/user/dnet.scholexplorer/scholix/provision/summary_json').map(json.loads).map(lambda x: {"title": x['title'], "description":[x['description']] if x['description'] is not None else [], "subject":x['subject'], "sid":x['id']})
|
||||
sources = sources.map(lambda x: {'id':x})
|
||||
|
||||
#enriches data with summaries information from scholexplorer
|
||||
sdf = sources.toDF()
|
||||
smdf = summaries.toDF()
|
||||
enriched_sx = sdf.join(smdf, sdf.id == smdf.sid).rdd.map(lambda x: {"id" : x['id'], "abstract": x['description'], "title":x['title'], "subject":x['subject']})
|
||||
enriched_sx.map(json.dumps).saveAsTextFile(path = '/tmp/miriam/context_propagation/scholexplorerEnrichedSX', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
|
||||
|
||||
|
||||
nodesdf = nodes.toDF()
|
||||
nodesdf.createOrReplaceTempView("ids")
|
||||
|
||||
#associates the dedup openaire identifier to ingested scholix ids
|
||||
dfr = spark.read.json('/tmp/beta_provision/graph/13_graph_blacklisted/relation')
|
||||
relation = dfr.createOrReplaceTempView("relation")
|
||||
mergedIds = spark.sql("SELECT source, target from relation join ids on relation.target = ids.scholix where datainfo.deletedbyinference = false and relclass = 'merges' ")
|
||||
mergedIds.rdd.map(lambda x: {'dedup':x['source'], 'scholixId':x['target']}).map(json.dumps).saveAsTextFile(path='/tmp/miriam/context_propagation/scholixIdsMergedInOA', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
|
||||
|
||||
#new execution on dfr = spark.read.json('/tmp/miriam/context_propagation/14_graph_blacklisted/relation')
|
||||
#this will be saved as the one above with the postfix 'production'
|
||||
|
||||
#replace the ingested scholix id with the deduped one
|
||||
mergedIds.createOrReplaceTempView("merged")
|
||||
changeInMerged = spark.sql("Select * from ids left join merged on ids.scholix = merged.target")
|
||||
enrich = changeInMerged.rdd.map(lambda x: x['source'] if x['source'] is not None else x['scholix']).distinct().map(lambda x:{"enrich":x})
|
||||
|
||||
edf = enrich.toDF()
|
||||
edf.createOrReplaceTempView("enrich_ids")
|
||||
|
||||
ddf = spark.read.json('/tmp/beta_provision/graph/13_graph_blacklisted/dataset')
|
||||
ddf.createOrReplaceTempView("dataset")
|
||||
|
||||
#new execution on ddf = spark.read.json('/tmp/miriam/context_propagation/14_graph_blacklisted/dataset')
|
||||
|
||||
#enriches the scholix ingested records with information for title abstract and subject found in OpenAIRE
|
||||
enriched_dataset = spark.sql("select a.* from (select id, title.value title, description.value description, collect_set(named_struct('scheme', MyS.qualifier.classid, 'value', MyS.value)) as subjects from dataset lateral view explode (subject)s as MyS where datainfo.deletedbyinference = false group by title.value, description.value, id) as a join enrich_ids on a.id = enrich_ids.enrich")
|
||||
|
||||
pdf = spark.read.json('/tmp/beta_provision/graph/13_graph_blacklisted/publication')
|
||||
#new execution on pdf = spark.read.json('/tmp/miriam/context_propagation/14_graph_blacklisted/publication')
|
||||
pdf.createOrReplaceTempView("publication")
|
||||
enriched_publication = spark.sql("select a.* from (select id, title.value title, description.value description, collect_set(named_struct('scheme', MyS.qualifier.classid, 'value', MyS.value)) as subjects from publication lateral view explode (subject)s as MyS where datainfo.deletedbyinference = false group by title.value, description.value, id) as a join enrich_ids on a.id = enrich_ids.enrich")
|
||||
|
||||
enriched = enriched_dataset.rdd.map(lambda x: {"id":x['id'], 'title':x['title'], 'abstract':x['description'], 'subject':[{'scheme': subject['scheme'], "value": subject['value']} for subject in x['subjects']]}).union(enriched_publication.rdd.map(lambda x: {"id":x['id'], 'title':x['title'], 'abstract':x['description'], 'subject':[{'scheme': subject['scheme'], "value": subject['value']} for subject in x['subjects']]}))
|
||||
enriched.map(json.dumps).saveAsTextFile(path='/tmp/miriam/context_propagation/scholixIdsEnrichedInOA', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
|
||||
enriched.toDF().createOrReplaceTempView("enriched")
|
||||
changeInMerged.createOrReplaceTempView("merged_ids")
|
||||
|
||||
#associate the possibly deduplicated openaire ids with the original scholix id in openaire
|
||||
scholixIds = spark.sql("select scholix, id, title, abstract, subject from enriched left join merged_ids on id = source").rdd.map(lambda x: {"id":x['scholix'] if x['scholix'] is not None else x['id'], "title":x['title'], "abstract":x['abstract'],'subject':x['subject']})
|
||||
scholixIds.map(json.dumps).saveAsTextFile(path='/tmp/miriam/context_propagation/scholixIdsEnrichedOA', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
|
||||
sids=scholixIds.toDF()
|
||||
|
||||
sids.createOrReplaceTempView("scholix")
|
||||
mdf = scholexplorerMapOpenaire.toDF()
|
||||
mdf.createOrReplaceTempView("map")
|
||||
|
||||
#original scholexplorer ids with the enrichment from openaire
|
||||
scholexplorerEnrichedOA = spark.sql("Select scholexplorer, title, abstract, subject from scholix join map on id = map.openaire")
|
||||
scholexplorerEnrichedOA = scholexplorerEnrichedOA.rdd.map(lambda x: {'id': x['scholexplorer'], 'title':x['title'], 'abstract':x['abstract'],'subject':x['subject']} )
|
||||
scholexplorerEnrichedOA.map(json.dumps).saveAsTextFile(path='/tmp/miriam/context_propagation/scholexplorerEnrichedOA', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
|
||||
|
||||
|
||||
sxEnriched = enriched_sx.union(scholexplorerEnrichedOA).map(lambda x: (x['id'],x)).groupByKey().map(groupFunction)
|
||||
|
||||
sxEnriched.map(json.dumps).saveAsTextFile(path='/tmp/miriam/context_propagation/scholexplorerEnriched', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
|
||||
|
||||
#select the set of relevant relationships
|
||||
relation = sc.textFile('/user/dnet.scholexplorer/scholix/provision/scholix_dump/scholix_json').map(json.loads).filter(lambda x: x['source']['dnetIdentifier'][0:2]!='70' and x['target']['dnetIdentifier'][0:2] != '70')
|
||||
pub_rel = relation.filter(lambda x: x['source']['objectType'] == 'publication' and x['target']['objectType'] == 'dataset').map(lambda x: {'source':x['source']['dnetIdentifier'], 'semantics':x['relationship']['name'].lower(), 'target':x['target']['dnetIdentifier']})
|
||||
#167,210,655
|
||||
#count the semantics in the subgraph identified by publications with relations towards datasets
|
||||
pub_rel.map(lambda x: (x['semantics'], 1)).reduceByKey(lambda a,b: a+b).collect()
|
||||
#[(u'iscitedby', 11542), (u'reviews', 2051), (u'iscompiledby', 499), (u'unknown', 111706), (u'isnewversionof', 27977), (u'requires', 1), (u'isdocumentedby', 747), (u'describes', 211), (u'issourceof', 30877), (u'ismetadataof', 11), (u'isversionof', 269006), (u'ispartof', 454244), (u'issupplementedby', 1517666), (u'obsoletes', 5), (u'isreferencedby', 89986753), (u'isvariantformof', 3688), (u'hasassociationwith', 30), (u'isidenticalto', 293876), (u'haspart', 621177), (u'ismetadatafor', 121), (u'isrelatedto', 70310923), (u'issupplementto', 85460), (u'isoriginalformof', 476), (u'iscontinuedby', 356407), (u'cites', 200336), (u'ispreviousversionof', 24119), (u'hasversion', 273427), (u'isdescribedby', 5), (u'continues', 356582), (u'isreviewedby', 53), (u'documents', 265636), (u'compiles', 177), (u'references', 2004247), (u'isobsoletedby', 2), (u'isderivedfrom', 617)]
|
||||
|
||||
pub_dats_sem = {'issupplementedby':{'use':'latent', 'weight':1.0, 'path':set()}, 'cites':{'use':'reuse', 'weight':1.0, 'path':set()}, 'describes':{'use':'latent', 'weight':1.0, 'path':set()},'references':{'use':'reuse', 'weight':1.0, 'path':set()}, 'documents':{'use':'latent','weight':1, 'path':set()}}
|
||||
pub_rel_subset = pub_rel.filter(lambda x: x['semantics'] in pub_dats_sem)
|
||||
pub_rel_subset.count()
|
||||
#3,988,096
|
||||
|
||||
pubdf = pub_rel_subset.toDF()
|
||||
sxdf = sxEnriched.toDF()
|
||||
|
||||
pubs_enriched = pubdf.join(sxdf, pubdf.source == sxdf.id)
|
||||
|
||||
|
||||
pubs_with_abst = pubs_enriched.rdd.filter(lambda x: x['abstract'] != [] or x['subject'] != []).map(lambda x: {'source':x['source'], 'semantics':x['semantics'], 'target': x['target']})
|
||||
|
||||
pubs_with_abst.count()
|
||||
*/
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
|
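A worked example of the threshold rule implemented in PropagationUtils.propagateDataset above: a value propagates across a dataset-to-dataset link only when the uses are compatible (equal, or the link is a "proxy"), the source is not already on the path, and the combined weight stays above 0.3. The weights below are hypothetical; the constants shipped in Costants are all "1.0".

// Sketch only, with invented weights, to make the decay rule concrete.
object WeightRuleSketch extends App {
  val existingWeight = 0.5                     // weight already recorded on dataset D1 for some publication (hypothetical)
  val linkWeight = 0.5                         // weight of the D1 -> D2 semantics taken from Costants (hypothetical)
  val propagatedWeight = existingWeight * linkWeight
  val keep = propagatedWeight > 0.3            // 0.25 here, so this hop would be discarded
  println(s"propagated weight $propagatedWeight kept: $keep")
}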
@ -0,0 +1,113 @@
package eu.dnetlib.dhp.contextpropagation

import java.util

import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.contextpropagation.model.{DatasetPropagationStructure, EnrichedEntries, Node, PropagationStructure, PropagationUse, RelationPropagation}
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
import eu.dnetlib.dhp.provision.scholix.Scholix
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}

import scala.collection.JavaConverters._

object SparkContextPropagation {

  implicit val relationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
  implicit val tupleForRelation: Encoder[(String, RelationPropagation)] = Encoders.tuple(Encoders.STRING, relationEncoder)
  implicit val enrichedEntitiesEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
  implicit val tupleForEntities: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEntitiesEncoder)

  implicit val mapEncoderPub: Encoder[PropagationStructure] = Encoders.kryo[PropagationStructure]
  implicit val mapEncoderDats: Encoder[DatasetPropagationStructure] = Encoders.kryo[DatasetPropagationStructure]
  implicit val tupleForPropagation: Encoder[(String, PropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
  implicit val tupleForPropagationDars: Encoder[(String, DatasetPropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderDats)
  implicit val stringEncoder: Encoder[String] = Encoders.STRING

  def getEnrichedPublications(allowedRelations: Dataset[RelationPropagation], enrichedEntitiesPath: String, spark: SparkSession): Dataset[RelationPropagation] = {

    val startingPropagation = allowedRelations
      .filter(r => r.getSource.getId.startsWith("50")).map(r => (r.getSource.getId, r))

    val enrichedNodes = spark.read.load(enrichedEntitiesPath).as[EnrichedEntries]
      .map(e => (e.getScholixId, e))

    startingPropagation.joinWith(enrichedNodes, startingPropagation("_1").equalTo(enrichedNodes("_1"))).map(tuple => tuple._1._2)
  }

  def propagatePublicationDataset(pubs_rel: Dataset[RelationPropagation]): Dataset[(String, PropagationStructure)] = {

    val publication_dataset: Dataset[(String, PropagationStructure)] = pubs_rel.map(r => {
      val ps = new PropagationStructure

      val pv: List[PropagationUse] = List(PropagationUse.copyInstance(Costants.getPublicationValue(r.getSemantics)))
      ps.add(r.getSource.getId, pv.asJava)
      (r.getTarget.getId, ps)
    })
    publication_dataset.groupByKey(_._1)
      .agg(PropagationAggregator.getDatasetAggregator().toColumn)
  }

  def propagateDatasetDataset(propagation: Dataset[(String, PropagationStructure)], dataset_dataset: Dataset[(String, DatasetPropagationStructure)], count: Int): Dataset[(String, PropagationStructure)] = {

    val pl2_step1: Dataset[(String, PropagationStructure)] = propagation.joinWith(dataset_dataset, propagation("_1").equalTo(dataset_dataset("_1")))
      .flatMap(PropagationUtils.propagateDataset)

    val pl2: Dataset[(String, PropagationStructure)] = propagation.union(pl2_step1).groupByKey(_._1)
      .agg(PropagationAggregator.getDatasetAggregator().toColumn)

    pl2
  }

  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertDatasetToJson.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/dataset2Json.json")))
    parser.parseArgument(args)
    val conf = new SparkConf
    val spark = SparkSession.builder.config(conf).appName(SparkConvertDatasetToJson.getClass.getSimpleName).master(parser.get("master")).getOrCreate
    val propagationOutputPath = parser.get("propagationOutputPath")

    val allowedRelations = spark.read.load(parser.get("allowedRelationPath")).as[RelationPropagation]

    val dataset_dataset: Dataset[(String, DatasetPropagationStructure)] = allowedRelations.filter(r => r.getSource.getId.startsWith("60"))
      .map(r => {
        val ps = new DatasetPropagationStructure

        ps.add(r.getTarget.getId, PropagationUse.copyInstance(Costants.getDatasetValue(r.getSemantics)))
        (r.getSource.getId, ps)
      })

    val pl1: Dataset[(String, PropagationStructure)] = propagatePublicationDataset(
      getEnrichedPublications(allowedRelations, parser.get("enrichedEntitiesPath"), spark))

    pl1.write.mode(SaveMode.Overwrite).save(s"$propagationOutputPath/pl1")

    var propagation = pl1

    var count = 1

    do {
      count += 1
      propagation = propagateDatasetDataset(propagation, dataset_dataset, count)
      propagation.write.mode(SaveMode.Overwrite).save(s"$propagationOutputPath/pl${count}")
    }
    while (propagation.count() > 0)
  }
}
|
|
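A small sketch, assuming the output layout produced by the loop in main above (pl1, pl2, ... under propagationOutputPath) and the same kryo/tuple encoders, of how one propagation round could be read back for inspection; the path and round number are hypothetical.

// Sketch only: reads back a single round written by the iterative loop above.
object InspectPropagationRoundSketch {
  import eu.dnetlib.dhp.contextpropagation.model.PropagationStructure
  import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("inspect-propagation").master("local[*]").getOrCreate()
    implicit val psEncoder: Encoder[PropagationStructure] = Encoders.kryo[PropagationStructure]
    implicit val pairEncoder: Encoder[(String, PropagationStructure)] =
      Encoders.tuple(Encoders.STRING, psEncoder)

    // hypothetical propagationOutputPath and round
    val round2 = spark.read.load("/tmp/context_propagation/pl2").as[(String, PropagationStructure)]
    println(s"entries produced at round 2: ${round2.count()}")
  }
}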
@ -0,0 +1,45 @@
package eu.dnetlib.dhp.contextpropagation

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.contextpropagation.model.{DatasetPropagationStructure, EnrichedEntries, MapSxOA, Node, PropagationStructure, PropagationUse, RelationPropagation}
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
import eu.dnetlib.dhp.provision.scholix.Scholix
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}

/**
 * It takes the summaries of the scholexplorer nodes involved in propagation
 */
object SparkEnrichScholixStep1 {

  implicit val enrichedEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
  implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
  implicit val tupleForJoinEncoder: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEncoder)

  def getEnrichedSubset(scholixSelectedRelationPath: String, summaryPath: String, spark: SparkSession): Dataset[EnrichedEntries] = {

    //selects the scholix nodes involved in propagation
    val distinctNodes: Dataset[String] = PropagationUtils.getSelectedNodes(scholixSelectedRelationPath, spark)

    val scholixSummaries = PropagationUtils.enrichScholix(summaryPath, spark).filter(o => o != null)
      .map(e => (e.getScholixId, e))

    //enriches the selected nodes with summary from scholexplorer
    distinctNodes.joinWith(scholixSummaries, distinctNodes("value").equalTo(scholixSummaries("_1"))).map(pair => pair._2._2).filter(o => o != null)
  }

  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEnrichScholixStep1.getClass.getResourceAsStream("/eu/dnetlib/dhp/contextpropagation/enrich-scholexplorer.json")))
    parser.parseArgument(args)
    val conf = new SparkConf
    val spark = SparkSession.builder.config(conf).appName(SparkEnrichScholixStep1.getClass.getSimpleName).master(parser.get("master")).getOrCreate

    getEnrichedSubset(parser.get("inputPath"), parser.get("scholixSummaryPath"), spark)
      .write.mode(SaveMode.Overwrite).save(parser.get("outputPath"))
  }
}
|
|
@ -0,0 +1,108 @@
package eu.dnetlib.dhp.contextpropagation

import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.contextpropagation.model.{EnrichedEntries, MapSxOA}
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
import eu.dnetlib.dhp.provision.scholix.Scholix
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}

/**
 * It takes the enrichment from OpenAIRE. It considers only deduped entities, since those not deduped get their
 * enrichment directly from scholexplorer.
 *
 * One step for each result type
 */
object SparkEnrichScholixStep2 {

  implicit val enrichedEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
  implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
  implicit val tupleForJoinEncoder: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEncoder)
  implicit val mapEncoder: Encoder[MapSxOA] = Encoders.kryo[MapSxOA]
  implicit val tupleForJoinMap: Encoder[(String, MapSxOA)] = Encoders.tuple(Encoders.STRING, mapEncoder)

  def getMappingScholexplorerOpenAIRE(scolixPath: String, spark: SparkSession): Dataset[MapSxOA] = {
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
    implicit val mapEncoder: Encoder[MapSxOA] = Encoders.kryo[MapSxOA]

    var prefix = "50|scholix_____::"
    spark.read.load(scolixPath).as[Scholix]
      .map(s => s.getSource.getDnetIdentifier)(Encoders.STRING)
      .filter(id => !id.startsWith("70|"))
      .distinct()
      .map(id => {
        val map: MapSxOA = new MapSxOA()
        if (id.contains("dedup")) {
          map.setOaid(prefix + id.substring(17))
        } else {
          map.setOaid(prefix + id.substring(3))
        }
        map.setScholixId(id)
        map
      })
  }

  def enrichOpenAIRE(resourcePath: String, relationPath: String, spark: SparkSession): Dataset[EnrichedEntries] = {

    val mapper = new ObjectMapper()
    mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)

    val result = spark.read.json(resourcePath)
    val relation = spark.read.json(relationPath)

    result.createOrReplaceTempView("result")
    relation.createOrReplaceTempView("relation")

    spark.sql("SELECT id, target, title.value title, description.value description, subject.value svalues, subject.qualifier.classid sscheme " +
      " FROM relation" +
      " JOIN result " +
      " ON relation.source = result.id " +
      " WHERE relation.datainfo.deletedbyinference = false " +
      " AND relclass = 'merges'" +
      " AND relation.target like '50|scholix%' "
    )
      .map(line => {
        val ee: EnrichedEntries = new EnrichedEntries()
        ee.setOpenAireId(line.getString(1))
        ee.setTitle(line.getList(2))
        ee.setDescription(line.getList(3))
        ee.setSubject(PropagationUtils.getSubjectList(line.getList(4), line.getList(5)))
        ee
      })
  }

  def getEnrichedSubset(scholixPath: String, relationPath: String, resultPath: String, spark: SparkSession): Dataset[EnrichedEntries] = {

    val openAireInfo = enrichOpenAIRE(resultPath, relationPath, spark)
      .map(r => (r.getOpenAireId, r))

    val mapping = getMappingScholexplorerOpenAIRE(scholixPath, spark).map(m => (m.getOaid, m))

    mapping.joinWith(openAireInfo, mapping("_1").equalTo(openAireInfo("_1"))).map(t => {
      val ret: EnrichedEntries = t._2._2
      ret.setScholixId(t._1._2.getScholixId)
      ret
    })
  }

  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEnrichScholixStep2.getClass.getResourceAsStream("/eu/dnetlib/dhp/contextpropagation/enrich-openaire.json")))
    parser.parseArgument(args)
    val conf = new SparkConf
    val spark = SparkSession.builder.config(conf).appName(SparkEnrichScholixStep2.getClass.getSimpleName).master(parser.get("master")).getOrCreate

    getEnrichedSubset(parser.get("scholixPath"), parser.get("relationPath"), parser.get("resultPath"), spark)
      .filter(o => o != null)
      .write.mode(SaveMode.Overwrite).save(parser.get("outputPath"))
  }
}
|
||||
|
|
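An illustrative sketch of the prefix rewriting performed by getMappingScholexplorerOpenAIRE above, matching the rule already described in the commented PySpark script earlier in this diff; the identifier shapes mentioned in the comments are assumptions, only the prefix string and substring offsets come from the code.

// Sketch only: maps a scholexplorer dnet identifier to the id the record gets in OpenAIRE.
object ScholixIdMappingSketch {
  val prefix = "50|scholix_____::"

  def toOpenaireId(scholixDnetIdentifier: String): String =
    if (scholixDnetIdentifier.contains("dedup"))
      prefix + scholixDnetIdentifier.substring(17) // assumed 17-character "NN|dedup...::" prefix is dropped
    else
      prefix + scholixDnetIdentifier.substring(3)  // drops the two-digit type prefix plus '|'
}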
@ -0,0 +1,43 @@
package eu.dnetlib.dhp.contextpropagation

import eu.dnetlib.dhp.application.ArgumentApplicationParser

import eu.dnetlib.dhp.contextpropagation.model.{EnrichedEntries, MapSxOA}
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}

/**
 * It puts together the outcome of the two previous steps to get all the enrichments in one single entry
 */
object SparkEnrichScholixStep3 {

  implicit val enrichedEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
  implicit val tupleForJoinEncoder: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEncoder)

  def getEnriched(scholixPath: String, openairePath: String, spark: SparkSession): Dataset[EnrichedEntries] = {

    spark.read.load(scholixPath).as[EnrichedEntries]
      .union(spark.read.load(s"$openairePath/publication").as[EnrichedEntries])
      .union(spark.read.load(s"$openairePath/dataset").as[EnrichedEntries])
      .union(spark.read.load(s"$openairePath/software").as[EnrichedEntries])
      .union(spark.read.load(s"$openairePath/otherresearchproduct").as[EnrichedEntries])
      .map(ee => (ee.getScholixId, ee))
      .groupByKey(_._1)(Encoders.STRING)
      .agg(PropagationAggregator.mergeEnrichedEntries().toColumn).map(c => c._2)
  }

  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEnrichScholixStep3.getClass.getResourceAsStream("/eu/dnetlib/dhp/contextpropagation/enrich-scholexplorer.json")))
    parser.parseArgument(args)
    val conf = new SparkConf
    val spark = SparkSession.builder.config(conf).appName(SparkEnrichScholixStep3.getClass.getSimpleName).master(parser.get("master")).getOrCreate

    getEnriched(parser.get("scholixEnrichedPath"), parser.get("openaireEnrichedPath"), spark)
      .write.mode(SaveMode.Overwrite).save(parser.get("outputPath"))
  }
}
|
||||
|
|
@ -0,0 +1,62 @@
package eu.dnetlib.dhp.contextpropagation

/**
 * Selects all the Scholexplorer relations not involving nodes with prefix 70 (unknown) and for which the source node
 * is different from the target node
 */

import java.util

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.contextpropagation.model.{Node, RelationPropagation}
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixEntityId}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import scala.collection.JavaConverters._

object SparkSelectScholixRelations {

  implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]

  implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]

  def getPublisherList(item: List[ScholixEntityId]): util.List[String] = {
    item.map(p => p.getName).asJava
  }

  def getAllowedRelations(scholixPath: String, spark: SparkSession): Dataset[RelationPropagation] = {
    spark.read.load(scholixPath).as[Scholix]
      .filter(s => !s.getSource().getDnetIdentifier().substring(0, 2).equals("70"))
      .filter(s => !s.getTarget().getDnetIdentifier().substring(0, 2).equals("70"))
      .filter(s => !s.getSource.getDnetIdentifier.equals(s.getTarget.getDnetIdentifier))
      .map(s => {
        val rp = new RelationPropagation
        if (s.getSource.getPublisher != null)
          rp.setSource(Node.newInstance(s.getSource.getDnetIdentifier, getPublisherList(s.getSource.getPublisher.asScala.toList)))
        else
          rp.setSource(Node.newInstance(s.getSource.getDnetIdentifier, new util.ArrayList()))
        if (s.getTarget.getPublisher != null)
          rp.setTarget(Node.newInstance(s.getTarget.getDnetIdentifier, getPublisherList(s.getTarget.getPublisher.asScala.toList)))
        else
          rp.setTarget(Node.newInstance(s.getTarget.getDnetIdentifier, new util.ArrayList()))
        rp.setSemantics(s.getRelationship.getName.toLowerCase())
        rp
      })
  }

  def main(args: Array[String]): Unit = {
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkSelectScholixRelations.getClass.getResourceAsStream("/eu/dnetlib/dhp/contextpropagation/enrich-scholexplorer.json")))
    parser.parseArgument(args)
    val conf = new SparkConf
    val spark = SparkSession.builder.config(conf).appName(SparkSelectScholixRelations.getClass.getSimpleName).master(parser.get("master")).getOrCreate

    getAllowedRelations(parser.get("inputPath"), spark).write.mode(SaveMode.Overwrite).save(parser.get("outputPath"))
  }
}
|
|
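The filtering rule above can be illustrated on plain identifier pairs. A minimal sketch, assuming simple (source, target) strings instead of full Scholix objects; the identifiers are made up.

// Hypothetical identifiers; the same three predicates as getAllowedRelations above.
object RelationFilterSketch {
  final case class Rel(source: String, target: String)

  def allowed(rels: Seq[Rel]): Seq[Rel] =
    rels
      .filter(r => !r.source.startsWith("70")) // drop unknown sources
      .filter(r => !r.target.startsWith("70")) // drop unknown targets
      .filter(r => r.source != r.target)       // drop self-loops

  def main(args: Array[String]): Unit = {
    val sample = Seq(
      Rel("50|pub::1", "60|dat::2"), // kept
      Rel("70|unk::3", "60|dat::2"), // dropped: unknown source
      Rel("60|dat::2", "60|dat::2")  // dropped: self-loop
    )
    allowed(sample).foreach(println)
  }
}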
@ -0,0 +1,22 @@
|
|||
|
||||
package eu.dnetlib.dhp.contextpropagation.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.dhp.contextpropagation.model.PropagationUse;
|
||||
|
||||
public class DatasetPropagationStructure implements Serializable {
|
||||
|
||||
private Map<String, PropagationUse> propagation = new HashMap<>();
|
||||
|
||||
public Map<String, PropagationUse> getPropagation() {
|
||||
return propagation;
|
||||
}
|
||||
|
||||
public void add(String key, PropagationUse value) {
|
||||
propagation.put(key, value);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,148 @@
|
|||
|
||||
package eu.dnetlib.dhp.contextpropagation.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.SchemeValue;
|
||||
|
||||
public class EnrichedEntries implements Serializable {
|
||||
private static final Logger log = LoggerFactory.getLogger(EnrichedEntries.class);
|
||||
private String scholixId;
|
||||
private String openAireId;
|
||||
private List<String> title;
|
||||
private List<String> description;
|
||||
private List<SchemeValue> subject;
|
||||
|
||||
public String getScholixId() {
|
||||
return scholixId;
|
||||
}
|
||||
|
||||
public void setScholixId(String scholixId) {
|
||||
this.scholixId = scholixId;
|
||||
}
|
||||
|
||||
public List<String> getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(List<String> title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public List<String> getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public void setDescription(List<String> description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public List<SchemeValue> getSubject() {
|
||||
return subject;
|
||||
}
|
||||
|
||||
public void setSubject(List<SchemeValue> subject) {
|
||||
this.subject = subject;
|
||||
}
|
||||
|
||||
public String getOpenAireId() {
|
||||
return openAireId;
|
||||
}
|
||||
|
||||
public void setOpenAireId(String openAireId) {
|
||||
this.openAireId = openAireId;
|
||||
}
|
||||
|
||||
public EnrichedEntries mergeWith(EnrichedEntries ee) throws JsonProcessingException {
|
||||
if(ee == null){
|
||||
throw new RuntimeException("ERROR: ee is null");
|
||||
}
|
||||
|
||||
if (scholixId == null)
|
||||
scholixId = ee.scholixId;
|
||||
|
||||
if (openAireId == null)
|
||||
openAireId = ee.openAireId;
|
||||
|
||||
try {
|
||||
|
||||
Optional
|
||||
.ofNullable(ee.getDescription())
|
||||
.ifPresent(
|
||||
d -> d
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(this::mergeAbstract));
|
||||
|
||||
Optional
|
||||
.ofNullable((ee.getTitle()))
|
||||
.ifPresent(
|
||||
t -> t
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(this::mergeTitle));
|
||||
|
||||
Optional
|
||||
.ofNullable(ee.getSubject())
|
||||
.ifPresent(
|
||||
s -> s
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(this::mergeSubject));
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Error in merging " + ee.getScholixId(), e);
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
private void mergeSubject(SchemeValue sbj) {
|
||||
if (subject == null) {
|
||||
subject = new ArrayList<>();
|
||||
}
|
||||
for (SchemeValue s : subject) {
|
||||
if (s.getValue().equals(sbj.getValue())) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
subject.add(sbj);
|
||||
}
|
||||
|
||||
private void mergeAbstract(String dex) {
|
||||
if (description == null) {
|
||||
description = new ArrayList<>();
|
||||
}
|
||||
merge(dex, description);
|
||||
|
||||
}
|
||||
|
||||
private void mergeTitle(String t) {
|
||||
if (title == null) {
|
||||
title = new ArrayList<>();
|
||||
}
|
||||
merge(t, title);
|
||||
|
||||
}
|
||||
|
||||
private void merge(String st, List<String> lst) {
|
||||
for (String d : lst) {
|
||||
if (d.equals(st))
|
||||
return;
|
||||
}
|
||||
lst.add(st);
|
||||
}
|
||||
}
|
|
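A hedged usage sketch of the merge semantics above, written against the getters and setters shown in this diff; the identifiers and titles are hypothetical. Two partial records for the same Scholix id collapse into one, with the missing OpenAIRE id filled in and duplicate titles dropped.

import java.util

import eu.dnetlib.dhp.contextpropagation.model.EnrichedEntries

object EnrichedEntriesMergeSketch {
  // mergeWith appends to the receiver's lists, so they must be mutable java lists.
  private def titles(xs: String*): util.List[String] =
    new util.ArrayList[String](util.Arrays.asList(xs: _*))

  def main(args: Array[String]): Unit = {
    val fromScholix = new EnrichedEntries
    fromScholix.setScholixId("60|dat::1")            // hypothetical identifier
    fromScholix.setTitle(titles("A shared title"))

    val fromOpenaire = new EnrichedEntries
    fromOpenaire.setScholixId("60|dat::1")
    fromOpenaire.setOpenAireId("50|pub::9")          // hypothetical identifier
    fromOpenaire.setTitle(titles("A shared title", "An extra title"))

    val merged = fromScholix.mergeWith(fromOpenaire)
    println(merged.getOpenAireId) // 50|pub::9 - filled in from the second record
    println(merged.getTitle)      // [A shared title, An extra title] - duplicates dropped
  }
}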
@ -0,0 +1,26 @@
|
|||
|
||||
package eu.dnetlib.dhp.contextpropagation.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class MapSxOA implements Serializable {
|
||||
|
||||
private String scholixId;
|
||||
private String oaid;
|
||||
|
||||
public String getScholixId() {
|
||||
return scholixId;
|
||||
}
|
||||
|
||||
public void setScholixId(String scholixId) {
|
||||
this.scholixId = scholixId;
|
||||
}
|
||||
|
||||
public String getOaid() {
|
||||
return oaid;
|
||||
}
|
||||
|
||||
public void setOaid(String oaid) {
|
||||
this.oaid = oaid;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
package eu.dnetlib.dhp.contextpropagation.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class Node implements Serializable {
|
||||
private String id;
|
||||
private List<String> publisher;
|
||||
|
||||
public List<String> getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public void setPublisher(ArrayList<String> publisher) {
|
||||
this.publisher = publisher;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public static Node newInstance(String id, List<String> publisher) {
|
||||
Node n = new Node();
|
||||
n.id = id;
|
||||
n.publisher = publisher;
|
||||
return n;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,62 @@
|
|||
|
||||
package eu.dnetlib.dhp.contextpropagation.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class PropagationStructure implements Serializable {
|
||||
private Map<String, List<PropagationUse>> propagation = new HashMap<>();
|
||||
|
||||
public Map<String, List<PropagationUse>> getPropagation() {
|
||||
return propagation;
|
||||
}
|
||||
|
||||
public void add(String key, List<PropagationUse> value) {
|
||||
propagation.put(key, value);
|
||||
}
|
||||
|
||||
public void setPropagation(Map<String, List<PropagationUse>> propagation) {
|
||||
this.propagation = propagation;
|
||||
}
|
||||
|
||||
private void mergeList(PropagationUse use, List<PropagationUse> acc) {
|
||||
if (acc == null) {
|
||||
acc = new ArrayList<>();
|
||||
}
|
||||
for (PropagationUse pu : acc) {
|
||||
if (use.getUse().equals(pu.getUse())) {
|
||||
pu.getPath().addAll(use.getPath());
|
||||
if (Integer.valueOf(pu.getWeight()) < Integer.valueOf(use.getWeight())) {
|
||||
pu.setWeight(use.getWeight());
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
acc.add(use);
|
||||
}
|
||||
|
||||
public PropagationStructure mergeFrom(PropagationStructure ps) {
|
||||
if (ps == null)
|
||||
return this;
|
||||
for (String key : ps.propagation.keySet()) {
|
||||
if (propagation.containsKey(key)) {
|
||||
ps.propagation.get(key).forEach(use -> mergeList(use, propagation.get(key)));
|
||||
} else {
|
||||
propagation
|
||||
.put(
|
||||
key,
|
||||
ps.propagation
|
||||
.get(key)
|
||||
.stream()
|
||||
.map(pu -> PropagationUse.copyInstance(pu))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
|
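A hedged usage sketch of mergeFrom, with hypothetical identifiers and weights. Note that the lists held by the receiving structure are modified in place, so they are built here as mutable ArrayLists.

import java.util

import eu.dnetlib.dhp.contextpropagation.model.{PropagationStructure, PropagationUse}

object PropagationMergeSketch {
  def main(args: Array[String]): Unit = {
    // mergeFrom appends to the receiver's nested lists, so build them mutable.
    val pathA = new util.ArrayList[String](util.Arrays.asList("60|dat::2")) // hypothetical ids
    val usesA = new util.ArrayList[PropagationUse](
      util.Arrays.asList(PropagationUse.newInstance("reuse", "1", pathA)))

    val a = new PropagationStructure
    a.add("60|dat::1", usesA)

    val b = new PropagationStructure
    b.add("60|dat::1",
      util.Arrays.asList(PropagationUse.newInstance("reuse", "2", util.Arrays.asList("60|dat::3"))))
    b.add("60|dat::9",
      util.Arrays.asList(PropagationUse.newInstance("proxy", "1", util.Arrays.asList("60|dat::4"))))

    val merged = a.mergeFrom(b)
    val reuse = merged.getPropagation.get("60|dat::1").get(0)
    println(reuse.getWeight)                                // 2 - the higher weight wins
    println(reuse.getPath)                                  // [60|dat::2, 60|dat::3]
    println(merged.getPropagation.containsKey("60|dat::9")) // true - missing keys are copied
  }
}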
@ -0,0 +1,52 @@
|
|||
|
||||
package eu.dnetlib.dhp.contextpropagation.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class PropagationUse implements Serializable {
|
||||
private String use;
|
||||
private String weight;
|
||||
private List<String> path;
|
||||
|
||||
public String getUse() {
|
||||
return use;
|
||||
}
|
||||
|
||||
public void setUse(String use) {
|
||||
this.use = use;
|
||||
}
|
||||
|
||||
public String getWeight() {
|
||||
return weight;
|
||||
}
|
||||
|
||||
public void setWeight(String weight) {
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
public List<String> getPath() {
|
||||
return path;
|
||||
}
|
||||
|
||||
public void setPath(List<String> path) {
|
||||
this.path = path;
|
||||
}
|
||||
|
||||
public static PropagationUse newInstance(String use, String weight, List<String> path) {
|
||||
PropagationUse pu = new PropagationUse();
|
||||
pu.use = use;
|
||||
pu.weight = weight;
|
||||
pu.path = path;
|
||||
return pu;
|
||||
}
|
||||
|
||||
public static PropagationUse copyInstance(PropagationUse use) {
|
||||
PropagationUse pu = new PropagationUse();
|
||||
pu.path = use.path;
|
||||
pu.weight = use.weight;
|
||||
pu.use = use.use;
|
||||
return pu;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
|
||||
package eu.dnetlib.dhp.contextpropagation.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class Publisher extends ArrayList<String> implements Serializable {
|
||||
public Publisher() {
|
||||
super();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
|
||||
package eu.dnetlib.dhp.contextpropagation.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.dhp.contextpropagation.model.Node;
|
||||
|
||||
public class RelationPropagation implements Serializable {
|
||||
private Node source;
|
||||
private Node target;
|
||||
private String semantics;
|
||||
|
||||
public RelationPropagation() {
|
||||
}
|
||||
|
||||
public Node getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public void setSource(Node source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public Node getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
public void setTarget(Node target) {
|
||||
this.target = target;
|
||||
}
|
||||
|
||||
public String getSemantics() {
|
||||
return semantics;
|
||||
}
|
||||
|
||||
public void setSemantics(String semantics) {
|
||||
this.semantics = semantics;
|
||||
}
|
||||
}
|
|
@ -24,8 +24,6 @@ public class Constants {
|
|||
|
||||
public static String RESEARCH_INFRASTRUCTURE = "Research Infrastructure/Initiative";
|
||||
|
||||
public static String ORCID = "orcid";
|
||||
|
||||
static {
|
||||
accessRightsCoarMap.put("OPEN", "c_abf2");
|
||||
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
|
||||
|
|
|
@ -503,7 +503,7 @@ public class ResultMapper implements Serializable {
|
|||
|
||||
private static Pid getOrcid(List<StructuredProperty> p) {
|
||||
for (StructuredProperty pid : p) {
|
||||
if (pid.getQualifier().getClassid().equals(Constants.ORCID)) {
|
||||
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
return Pid
|
||||
|
|
|
@ -68,7 +68,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/";
|
||||
protected static final Qualifier ORCID_PID_TYPE = qualifier(
|
||||
"ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES);
|
||||
ModelConstants.ORCID_PENDING, ModelConstants.ORCID_CLASSNAME, DNET_PID_TYPES, DNET_PID_TYPES);
|
||||
protected static final Qualifier MAG_PID_TYPE = qualifier(
|
||||
"MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES);
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@ import com.google.common.collect.Lists;
|
|||
|
||||
import eu.dnetlib.dhp.common.PacePerson;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
|
@ -61,7 +62,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
author.setPid(new ArrayList<>());
|
||||
|
||||
if (StringUtils.isNotBlank(pid)) {
|
||||
if (type.startsWith("ORCID")) {
|
||||
if (type.toLowerCase().startsWith(ORCID)) {
|
||||
final String cleanedId = pid
|
||||
.replaceAll("http://orcid.org/", "")
|
||||
.replaceAll("https://orcid.org/", "");
|
||||
|
|
|
@ -20,6 +20,7 @@ import com.google.common.collect.Lists;
|
|||
|
||||
import eu.dnetlib.dhp.common.PacePerson;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
|
@ -98,7 +99,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
.replaceAll(" ", "")
|
||||
.replaceAll("_", "");
|
||||
|
||||
if (type.startsWith("ORCID")) {
|
||||
if (type.toLowerCase().startsWith(ModelConstants.ORCID)) {
|
||||
final String cleanedId = id.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", "");
|
||||
res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info));
|
||||
} else if (type.startsWith("MAGID")) {
|
||||
|
|
|
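Both mapper changes apply the same normalisation: a case-insensitive check on the pid type, then stripping of the orcid.org URL prefixes. A standalone sketch of that step, with a made-up example value.

object OrcidPidSketch {
  // Mirrors the mapper change above: case-insensitive type check, then strip the URL prefixes.
  def cleanOrcid(pidType: String, pidValue: String): Option[String] =
    if (pidType != null && pidType.toLowerCase.startsWith("orcid"))
      Some(
        pidValue
          .replaceAll("http://orcid.org/", "")
          .replaceAll("https://orcid.org/", ""))
    else
      None

  def main(args: Array[String]): Unit = {
    // Example values only.
    println(cleanOrcid("ORCID", "https://orcid.org/0000-0002-1825-0097")) // Some(0000-0002-1825-0097)
    println(cleanOrcid("MAGIdentifier", "12345"))                         // None
  }
}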
@ -0,0 +1,31 @@
|
|||
[{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
}
|
||||
,{
|
||||
"paramName": "sp",
|
||||
"paramLongName": "scholixPath",
|
||||
"paramDescription": "the path of the scholix summaries",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "rp",
|
||||
"paramLongName": "relationPath",
|
||||
"paramDescription": "the openaire graph input path",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "out",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the output path for the selected scholix relations",
|
||||
"paramRequired": false
|
||||
},{
|
||||
"paramName": "rePath",
|
||||
"paramLongName": "resultPath",
|
||||
"paramDescription": "the output path for the selected scholix relations",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
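A hedged sketch of how this parameter file is consumed, following the ArgumentApplicationParser usage visible in the Spark jobs of this diff; the argument values below are hypothetical.

import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils

object EnrichOpenaireArgsSketch {
  def main(args: Array[String]): Unit = {
    // The job loads the JSON above from the classpath and resolves each value by its long name.
    val parser = new ArgumentApplicationParser(
      IOUtils.toString(
        getClass.getResourceAsStream("/eu/dnetlib/dhp/contextpropagation/enrich-openaire.json")))

    parser.parseArgument(Array(
      "--master", "local[*]",
      "--scholixPath", "/tmp/scholix",               // hypothetical paths
      "--relationPath", "/tmp/graph/relation",
      "--resultPath", "/tmp/graph/publication",
      "--outputPath", "/tmp/openaireEnriched/publication"))

    println(parser.get("outputPath")) // /tmp/openaireEnriched/publication
  }
}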
@ -0,0 +1,35 @@
|
|||
[{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
}
|
||||
,{
|
||||
"paramName": "ssp",
|
||||
"paramLongName": "scholixSummaryPath",
|
||||
"paramDescription": "the path of the scholix summaries",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "ip",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the openaire graph input path",
|
||||
"paramRequired": false
|
||||
},{
|
||||
"paramName": "out",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the output path for the selected scholix relations",
|
||||
"paramRequired": false
|
||||
},{
|
||||
"paramName": "sep",
|
||||
"paramLongName": "scholixEnrichedPath",
|
||||
"paramDescription": "the output path for the selected scholix relations",
|
||||
"paramRequired": false
|
||||
},{
|
||||
"paramName": "oep",
|
||||
"paramLongName": "openaireEnrichedPath",
|
||||
"paramDescription": "the output path for the selected scholix relations",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -0,0 +1,77 @@
|
|||
<configuration>
|
||||
|
||||
<!-- OCEAN -->
|
||||
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2YarnHistoryServerAddress</name>-->
|
||||
<!-- <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>-->
|
||||
<!-- </property>-->
|
||||
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<!-- GARR -->
|
||||
|
||||
<!-- <property>-->
|
||||
<!-- <name>jobTracker</name>-->
|
||||
<!-- <value>yarn</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>nameNode</name>-->
|
||||
<!-- <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>hiveMetastoreUris</name>-->
|
||||
<!-- <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2YarnHistoryServerAddress</name>-->
|
||||
<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
|
||||
<!-- </property>-->
|
||||
|
||||
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2EventLogDir</name>-->
|
||||
<!-- <value>/user/spark/spark2ApplicationHistory</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2ExtraListeners</name>-->
|
||||
<!-- <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2SqlQueryExecutionListeners</name>-->
|
||||
<!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
|
||||
<!-- </property>-->
|
||||
</configuration>
|
||||
|
|
@ -0,0 +1,290 @@
|
|||
<workflow-app name="Context Propagation Preparation" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>scholixPath</name>
|
||||
<description>the Scholix Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>scholixSummaryPath</name>
|
||||
<description>the Scholix Summaries Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>inputPath</name>
|
||||
<description>the OpenAIRE Graph Input Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveDbName</name>
|
||||
<description>the target hive database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveJdbcUrl</name>
|
||||
<description>hive server jdbc url</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="select_scholix_relations"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="select_scholix_relations">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Select Scholix Relations</name>
|
||||
|
||||
<class>eu.dnetlib.dhp.contextpropagation.SparkSelectScholixRelations</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${scholixPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/scholixAllowedRelations</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="enrich_scholix_step1"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="enrich_scholix_step1">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Enrich Scholix Step1</name>
|
||||
|
||||
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep1</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/scholixAllowedRelations</arg>
|
||||
<arg>--scholixSummaryPath</arg><arg>${scholixSummaryPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/scholixEnriched</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="fork_enrich_scholix_step2"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="fork_enrich_scholix_step2">
|
||||
<path start="enrich_scholix_step2_publication"/>
|
||||
<path start="enrich_scholix_step2_dataset"/>
|
||||
<path start="enrich_scholix_step2_software"/>
|
||||
<path start="enrich_scholix_step2_orp"/>
|
||||
</fork>
|
||||
|
||||
<action name="enrich_scholix_step2_publication">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Enrich Scholix Step2 - Publication</name>
|
||||
|
||||
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--scholixPath</arg><arg>${scholixPath}</arg>
|
||||
<arg>--relationPath</arg><arg>${inputPath}/relation</arg>
|
||||
<arg>--resultPath</arg><arg>${inputPath}/publication</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/publication</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="join_enrich"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="enrich_scholix_step2_dataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Enrich Scholix Step2 - Dataset</name>
|
||||
|
||||
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--scholixPath</arg><arg>${scholixPath}</arg>
|
||||
<arg>--relationPath</arg><arg>${inputPath}/relation</arg>
|
||||
<arg>--resultPath</arg><arg>${inputPath}/dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/dataset</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="join_enrich"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="enrich_scholix_step2_software">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Enrich Scholix Step2 - Software</name>
|
||||
|
||||
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--scholixPath</arg><arg>${scholixPath}</arg>
|
||||
<arg>--relationPath</arg><arg>${inputPath}/relation</arg>
|
||||
<arg>--resultPath</arg><arg>${inputPath}/software</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/software</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="join_enrich"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="enrich_scholix_step2_orp">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Enrich Scholix Step2 - Other Research Product</name>
|
||||
|
||||
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--scholixPath</arg><arg>${scholixPath}</arg>
|
||||
<arg>--relationPath</arg><arg>${inputPath}/relation</arg>
|
||||
<arg>--resultPath</arg><arg>${inputPath}/otherresearchproduct</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/otherresearchproduct</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="join_enrich"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="join_enrich" to="enrich_scholix_step3"/>
|
||||
|
||||
<action name="enrich_scholix_step3">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Enrich Scholix Step3</name>
|
||||
|
||||
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep3</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--scholixEnrichedPath</arg><arg>${workingDir}/scholixEnriched</arg>
|
||||
<arg>--openaireEnrichedPath</arg><arg>${workingDir}/openaireEnriched</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/enrichedEntities</arg>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
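A hedged, local-mode sketch of the pipeline this workflow wires together: select the allowed relations, enrich them from the Scholix summaries, enrich each OpenAIRE result type, then merge. The paths are hypothetical, and the SparkEnrichScholixStep1.getEnrichedSubset signature is taken from the test class later in this diff.

import org.apache.spark.sql.{SaveMode, SparkSession}

import eu.dnetlib.dhp.contextpropagation.{SparkEnrichScholixStep1, SparkEnrichScholixStep2, SparkEnrichScholixStep3, SparkSelectScholixRelations}

object ContextPropagationLocalSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("context-propagation-sketch").getOrCreate()
    val workingDir = "/tmp/contextpropagation" // hypothetical paths throughout

    // 1. select the allowed Scholexplorer relations
    SparkSelectScholixRelations.getAllowedRelations("/tmp/scholix", spark)
      .write.mode(SaveMode.Overwrite).save(s"$workingDir/scholixAllowedRelations")

    // 2. enrich the selected relations from the Scholix summaries
    SparkEnrichScholixStep1.getEnrichedSubset(s"$workingDir/scholixAllowedRelations", "/tmp/summaries", spark)
      .write.mode(SaveMode.Overwrite).save(s"$workingDir/scholixEnriched")

    // 3. enrich each OpenAIRE result type (four parallel actions in the workflow)
    for (resultType <- Seq("publication", "dataset", "software", "otherresearchproduct"))
      SparkEnrichScholixStep2.getEnrichedSubset("/tmp/scholix", "/tmp/graph/relation", s"/tmp/graph/$resultType", spark)
        .filter(o => o != null)
        .write.mode(SaveMode.Overwrite).save(s"$workingDir/openaireEnriched/$resultType")

    // 4. merge the two enrichment sets into one entry per Scholix id
    SparkEnrichScholixStep3.getEnriched(s"$workingDir/scholixEnriched", s"$workingDir/openaireEnriched", spark)
      .write.mode(SaveMode.Overwrite).save(s"$workingDir/enrichedEntities")

    spark.stop()
  }
}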
@ -0,0 +1,22 @@
|
|||
package eu.dnetlib.dhp.contextpropagation
|
||||
|
||||
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
|
||||
import eu.dnetlib.dhp.contextpropagation.model.{EnrichedEntries, RelationPropagation}
|
||||
import eu.dnetlib.dhp.provision.scholix.Scholix
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
|
||||
import org.apache.spark.sql.{Encoder, Encoders}
|
||||
|
||||
class PropagationTest extends java.io.Serializable {
|
||||
|
||||
val m: ObjectMapper = new ObjectMapper()
|
||||
m.enable(SerializationFeature.INDENT_OUTPUT)
|
||||
|
||||
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||
implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
|
||||
implicit val enrichedEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEncoder)
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,144 @@
|
|||
package eu.dnetlib.dhp.contextpropagation
|
||||
|
||||
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
|
||||
import eu.dnetlib.dhp.contextpropagation.model.{EnrichedEntries, RelationPropagation}
|
||||
import eu.dnetlib.dhp.provision.scholix.Scholix
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.junit.jupiter.api.Assertions.{assertFalse, assertNotNull, assertTrue}
|
||||
|
||||
class ScholixTest extends java.io.Serializable{
|
||||
|
||||
|
||||
val m: ObjectMapper = new ObjectMapper()
|
||||
m.enable(SerializationFeature.INDENT_OUTPUT)
|
||||
|
||||
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||
implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
|
||||
implicit val enrichedEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEncoder)
|
||||
|
||||
|
||||
@Test
|
||||
def selectScholexplorerRelationTest(): Unit ={
|
||||
val sourcePath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/scholix-relations-00000.parquet").getPath
|
||||
|
||||
val conf : SparkConf = new SparkConf()
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().appName("SelectScholixRelationTest").master("local").config(conf).getOrCreate()
|
||||
|
||||
val tmp = SparkSelectScholixRelations.getAllowedRelations(sourcePath, spark)
|
||||
|
||||
tmp.write.mode(SaveMode.Overwrite).save("/tmp/temp")
|
||||
assert(tmp.count > 0)
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def SelectDistinctIDTest(): Unit ={
|
||||
|
||||
val sourcePath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/producedInfo/selectedRelations.parquet").getPath
|
||||
|
||||
val conf : SparkConf = new SparkConf()
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().appName("SelectDistinctIdsTest").master("local").config(conf).getOrCreate()
|
||||
|
||||
implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
|
||||
|
||||
val allowedRelations = spark.read.load(sourcePath).as[RelationPropagation]
|
||||
|
||||
val numberOfNodes = allowedRelations.map(r => r.getSource.getId)(Encoders.STRING)
|
||||
.union(allowedRelations.map(r => r.getTarget.getId)(Encoders.STRING)).count()
|
||||
|
||||
val tmp : Dataset[String]= PropagationUtils.getSelectedNodes(sourcePath, spark)
|
||||
|
||||
|
||||
assert (numberOfNodes > tmp.count())
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def mappingScholixOpenAIRETest(): Unit ={
|
||||
val sourcePath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/scholix-relations-00000.parquet").getPath
|
||||
|
||||
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
|
||||
//val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()
|
||||
val spark: SparkSession = SparkSession.builder().appName("Test").master("local").config(new SparkConf()).getOrCreate()
|
||||
|
||||
val tmp = SparkEnrichScholixStep2.getMappingScholexplorerOpenAIRE(sourcePath, spark)
|
||||
|
||||
|
||||
tmp.filter(e => e.getScholixId.contains("dedup"))
|
||||
.foreach(e => assertTrue(e.getScholixId.substring(17).equals(e.getOaid.substring(17))))
|
||||
tmp.filter(e => !e.getScholixId.contains("dedup"))
|
||||
.foreach(e => assertTrue(e.getOaid.substring(17).equals(e.getScholixId.substring(3))))
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
def enrichScholixTest():Unit = {
|
||||
val summaryPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/part-00000-summaries.parquet").getPath
|
||||
|
||||
val relationPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/producedInfo/selectedRelations.parquet").getPath
|
||||
|
||||
val conf : SparkConf = new SparkConf()
|
||||
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().config(conf) .appName("Test").master("local").getOrCreate()
|
||||
|
||||
val tmp = SparkEnrichScholixStep1.getEnrichedSubset(relationPath, summaryPath, spark)
|
||||
|
||||
assert(tmp.count() == 5)
|
||||
|
||||
//tmp.write.mode(SaveMode.Overwrite).save("/tmp/scholixEnriched")
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def enrichOpenAIRETest():Unit = {
|
||||
|
||||
val conf : SparkConf = new SparkConf()
|
||||
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().config(conf) .appName("Test").master("local").getOrCreate()
|
||||
|
||||
val scholixPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/scholix-relations-00000.parquet").getPath
|
||||
val relationPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/relation.json").getPath
|
||||
val resultPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/result/publication").getPath
|
||||
|
||||
val tmp = SparkEnrichScholixStep2.getEnrichedSubset(scholixPath, relationPath , resultPath , spark)
|
||||
|
||||
print(tmp.count())
|
||||
|
||||
assert(tmp.count() == 1)
|
||||
|
||||
tmp.write.mode(SaveMode.Overwrite).save("/tmp/openaireEnriched")
|
||||
}
|
||||
|
||||
@Test
|
||||
def mergeEnrichmentsTest():Unit = {
|
||||
|
||||
val conf : SparkConf = new SparkConf()
|
||||
|
||||
|
||||
val spark: SparkSession = SparkSession.builder().config(conf) .appName("Test").master("local").getOrCreate()
|
||||
|
||||
val scholixPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/producedInfo/scholixEnriched.parquet").getPath
|
||||
val resultPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/producedInfo/result").getPath
|
||||
|
||||
val tmp = SparkEnrichScholixStep3.getEnriched(scholixPath, resultPath , spark)
|
||||
|
||||
|
||||
assert(tmp.count() == 5)
|
||||
|
||||
tmp.write.mode(SaveMode.Overwrite).save("/tmp/mergedEnriched")
|
||||
|
||||
tmp.foreach(r => print(m.writeValueAsString(r)))
|
||||
}
|
||||
}
|
|
@ -0,0 +1,169 @@
|
|||
package eu.dnetlib.dhp.contextpropagation
|
||||
|
||||
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
|
||||
import eu.dnetlib.dhp.contextpropagation.model.{DatasetPropagationStructure, EnrichedEntries, MapSxOA, Node, PropagationStructure, PropagationUse, RelationPropagation}
|
||||
import eu.dnetlib.dhp.provision.scholix.summary.{SchemeValue, ScholixSummary}
|
||||
import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixEntityId}
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
|
||||
import org.junit.jupiter.api.Assertions.{assertFalse, assertNotNull}
|
||||
import org.junit.jupiter.api.Test
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
||||
class TestProva extends java.io.Serializable{
|
||||
|
||||
|
||||
|
||||
val m: ObjectMapper = new ObjectMapper()
|
||||
m.enable(SerializationFeature.INDENT_OUTPUT)
|
||||
|
||||
|
||||
|
||||
|
||||
// @Test
|
||||
// def testFunderRelationshipsMapping(): Unit = {
|
||||
//
|
||||
//
|
||||
// def findInDats(dats: Dataset[(String, DatasetPropagationStructure)], elem:String) : Dataset[(String, DatasetPropagationStructure)] = {
|
||||
// dats.filter(dats("_1") === elem)
|
||||
// }
|
||||
//
|
||||
//
|
||||
// val sourcePath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/part-00000.parquet").getPath
|
||||
//
|
||||
//
|
||||
// implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
|
||||
// implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
|
||||
// implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
|
||||
// implicit val mapEncoderPub: Encoder[PropagationStructure] = Encoders.kryo[PropagationStructure]
|
||||
// implicit val mapEncoderDats: Encoder[DatasetPropagationStructure] = Encoders.kryo[DatasetPropagationStructure]
|
||||
// implicit val tupleForPropagation: Encoder[(String, PropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
|
||||
// implicit val tupleForPropagationDars: Encoder[(String, DatasetPropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderDats)
|
||||
// implicit val stringEncoder: Encoder[String] = Encoders.STRING
|
||||
//
|
||||
//
|
||||
// val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()
|
||||
//
|
||||
//
|
||||
// val ds: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
|
||||
//
|
||||
// val allowedRelations : Dataset[RelationPropagation] = ds
|
||||
// .filter(s => !s.getSource().getDnetIdentifier().substring(0,2).equals("70") )
|
||||
// .filter(s => !s.getTarget().getDnetIdentifier().substring(0,2).equals("70"))
|
||||
// .map(s => {
|
||||
// val rp = new RelationPropagation
|
||||
// rp.setSource(Node.newInstance(s.getSource.getDnetIdentifier))//, getPublisherList(s.getSource.getPublisher.asScala.toList)))
|
||||
// rp.setTarget(Node.newInstance(s.getTarget.getDnetIdentifier))//, getPublisherList(s.getTarget.getPublisher.asScala.toList)))
|
||||
// rp.setSemantics(s.getRelationship.getName)
|
||||
// rp
|
||||
// })
|
||||
//
|
||||
//
|
||||
// //println(allowedRelations.count())
|
||||
//
|
||||
// val pubs_rel : Dataset[RelationPropagation] = allowedRelations.filter(r => r.getSource.getId.startsWith("50"))
|
||||
// .filter(r => r.getTarget.getId.startsWith("60")).filter(r => Costants.containedInPubSem(r.getSemantics.toLowerCase()))
|
||||
//
|
||||
// val dats_rel : Dataset[RelationPropagation] = allowedRelations
|
||||
// .filter(r => r.getSource.getId.startsWith("60")
|
||||
// && r.getTarget.getId.startsWith("60")
|
||||
// && Costants.containedInDatsSem(r.getSemantics.toLowerCase())
|
||||
// && r.getSource.getId != r.getTarget.getId)
|
||||
//
|
||||
// val publication_dataset : Dataset[(String, PropagationStructure)] = pubs_rel.map(r => {
|
||||
// val ps = new PropagationStructure
|
||||
//
|
||||
// val pv : List[PropagationUse] = List(PropagationUse.copyInstance(Costants.getPublicationValue(r.getSemantics.toLowerCase())))
|
||||
// ps.add(r.getSource.getId, pv.asJava)
|
||||
// (r.getTarget.getId, ps)
|
||||
//
|
||||
// })
|
||||
//
|
||||
//
|
||||
// val pl1 : Dataset[(String, PropagationStructure)] = publication_dataset.groupByKey(_._1)(Encoders.STRING)
|
||||
// .agg(PropagationAggregator.getDatasetAggregator().toColumn)
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// // print(pl1.count)
|
||||
//
|
||||
// val dataset_dataset : Dataset[(String, DatasetPropagationStructure)] = dats_rel.map(r => {
|
||||
// val ps = new DatasetPropagationStructure
|
||||
//
|
||||
// ps.add(r.getTarget.getId, PropagationUse.copyInstance(Costants.getDatasetValue(r.getSemantics.toLowerCase())))
|
||||
// (r.getSource.getId, ps)
|
||||
//
|
||||
// })
|
||||
////
|
||||
//// //pl1.foreach(r => print(m.writeValueAsString(r._1)))
|
||||
////
|
||||
////
|
||||
////
|
||||
// val dataset_dataset_modified : Dataset[(String, DatasetPropagationStructure)] =
|
||||
// dataset_dataset.map(ds => {
|
||||
// if(ds._1 == "60|4b5e9fa8e91b206001589993179f69d1"){
|
||||
// ("60|82368200e90cf75c714b58288a371bbe", ds._2)
|
||||
// }
|
||||
// else{
|
||||
// ds
|
||||
// }
|
||||
// })
|
||||
////
|
||||
//// // findInDats(dataset_dataset_modified, "60|82368200e90cf75c714b58288a371bbe").show(false)
|
||||
////
|
||||
////
|
||||
// val pl2_step1 = pl1.joinWith(dataset_dataset_modified, pl1("value")
|
||||
// .equalTo(dataset_dataset_modified("_1")), "left")
|
||||
// .flatMap(PropagationUtils.propagateDataset)
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// val pl2= pl2_step1.groupByKey(_._1)(Encoders.STRING).agg(PropagationAggregator.getDatasetAggregator().toColumn)
|
||||
// print(pl2.count())
|
||||
//
|
||||
//
|
||||
//// pl1.foreach(i=> {
|
||||
//// if (i._1 =="60|b91b1296e3e37523887c2eaaf3f2e673")
|
||||
//// print(m.writeValueAsString(i))
|
||||
//// })
|
||||
////
|
||||
//// print(pl1.count)
|
||||
//
|
||||
////
|
||||
//
|
||||
// // print(m.writeValueAsString(dsprob.getPropagation.get(source).getUse))
|
||||
//
|
||||
//// print(dataset_dataset.map(d => {
|
||||
//// var found : Boolean = false
|
||||
//// for (elem <- d._2.getPropagation.keySet().asScala){
|
||||
//// if (d._2.getPropagation.get(elem).getUse == "proxy"){
|
||||
//// found = true
|
||||
//// }
|
||||
//// }
|
||||
//// if (found){
|
||||
//// d
|
||||
//// }else{
|
||||
//// null
|
||||
//// }
|
||||
//// }).filter(o => o != null).first()._1)
|
||||
//
|
||||
//
|
||||
//// dataset_dataset.foreach(d => {
|
||||
////
|
||||
//// for (elem <- d._2.getPropagation.keySet().asScala){
|
||||
//// if (d._2.getPropagation.get(elem).getUse == "reuse"){
|
||||
//// print("reuse")
|
||||
//// }
|
||||
//// }
|
||||
//// println()
|
||||
//// })
|
||||
//
|
||||
// }
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-prod","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"properties":[],"relClass":"merges","relType":"resultResult","source":"50|dedup_wf_001::000239c0f7ec8507afd7e02b4a853b56","subRelType":"dedup","target":"50|scholix_____::e6c6f093eb4f8c48201c157f5fcdd8f8"}
|
||||
{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite"}],"dataInfo":{"deletedbyinference":false,"invisible":false,"trust":"0.9"},"properties":[],"relClass":"merges","relType":"resultResult","source":"50|dedup_wf_001::002cf1de4469a0a318fdd1ff009659ec","subRelType":"relationship","target":"50|scholix_____::08d3a09fc700d2f614556cdd23762ad7"}
|
||||
{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite"}],"dataInfo":{"deletedbyinference":false,"invisible":false,"trust":"0.9"},"properties":[],"relClass":"merges","relType":"resultResult","source":"50|dedup_wf_001::00310fc57d006a502e06411f3ab35424","subRelType":"relationship","target":"50|scholix_____::e17f731657c15f24c42bfca61c26b113"}
|
||||
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_similarities_standard","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1612386060902,"properties":[{"key":"similarityLevel","value":"0.7032"}],"relClass":"hasAmongTopNSimilarDocuments","relType":"resultResult","source":"50|dedup_wf_001::0031d1f2103ebb2979c785e1b00b2319","subRelType":"similarity","target":"50|dedup_wf_001::c8ae7b6f575767dbebb18d58870b582b"}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-prod","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"properties":[],"relClass":"isMergedIn","relType":"resultResult","source":"50|datacite____::f97dc7ffbd237a68b9954095dd56dd91","subRelType":"dedup","target":"50|dedup_wf_001::569ad6db85b9568dfbd388a749c479f8"}
|
||||
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_referencedProjects","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.6573"},"lastupdatetimestamp":1612386075869,"properties":[],"relClass":"produces","relType":"resultProject","source":"40|aka_________::02c787a3a97d7bd6946672a8ec74ecfe","subRelType":"outcome","target":"50|dedup_wf_001::27514076973e90990e5cd9205fcc5317"}
|
||||
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:project:semrel","classname":"result:project:semrel","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"trust":"0.85"},"properties":[],"relClass":"produces","relType":"resultProject","source":"40|anr_________::d3c7c989a9e114593c7cb8f77edde5a3","subRelType":"outcome","target":"50|scholix_____::b79951545b294686860f14471f174ccc"}
|
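The sample rows above are plain JSON relations; the "merges" ones with a scholix target are the dedup links used to map Scholix identifiers onto OpenAIRE identifiers. A small Jackson sketch reading a trimmed copy of the first row.

import com.fasterxml.jackson.databind.ObjectMapper

object RelationSampleSketch {
  def main(args: Array[String]): Unit = {
    val mapper = new ObjectMapper()
    // Trimmed copy of the first sample row above (only the fields used here).
    val line =
      """{"relClass":"merges","relType":"resultResult","source":"50|dedup_wf_001::000239c0f7ec8507afd7e02b4a853b56","target":"50|scholix_____::e6c6f093eb4f8c48201c157f5fcdd8f8"}"""
    val node = mapper.readTree(line)
    if (node.get("relClass").asText() == "merges")
      println(s"${node.get("source").asText()} merges ${node.get("target").asText()}")
  }
}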
File diff suppressed because it is too large
Some files were not shown because too many files have changed in this diff