Compare commits

...

42 Commits

Author SHA1 Message Date
Miriam Baglioni 6b79d1cf2a - 2021-07-13 12:20:51 +02:00
Miriam Baglioni 87018ac895 merge with master 2021-07-13 12:17:37 +02:00
Miriam Baglioni c2ef9e3856 merge upstream 2021-06-22 14:35:55 +02:00
Miriam Baglioni 59c36eb185 check if pid is null (to avoid NullPointerException) 2021-06-21 10:41:47 +02:00
Miriam Baglioni 2d5f9e8e1c Merge branch 'master' of code-repo.d4science.org:miriam.baglioni/dnet-hadoop 2021-06-21 10:02:33 +02:00
Miriam Baglioni 066e1dc772 added new dependency ag common-text 2021-06-21 09:26:40 +02:00
Miriam Baglioni 00dfaff973 refactoring and changed query that was wrong 2021-06-21 09:23:55 +02:00
Miriam Baglioni 464ac6301c added the reader for the dump of crossref 2021-06-21 09:22:29 +02:00
Miriam Baglioni c07f820c21 - 2021-06-21 09:16:31 +02:00
Miriam Baglioni 2740b95f99 - 2021-06-21 09:16:05 +02:00
Miriam Baglioni ca7e10b3c0 - 2021-06-21 09:15:43 +02:00
Miriam Baglioni 2f6673e678 - 2021-06-21 09:14:32 +02:00
Claudio Atzori 6b8c357381 removed extra whitespace at the end of the file 2021-06-18 16:08:45 +02:00
Claudio Atzori c0d2b62e46 [doiboost] added missing implicit Encoder 2021-06-18 15:57:41 +02:00
Claudio Atzori a3948c1f6e cleanup old doiboost workflows 2021-06-18 15:14:08 +02:00
Claudio Atzori fddbc8364e Merge branch 'alessia.bardi-datepicker' 2021-06-17 09:24:46 +02:00
Alessia Bardi 6208b04f1d smarter DatePicker for ISO dates on dateofacceptance 2021-06-16 14:56:26 +02:00
Sandro La Bruzzo 9ca438d9b1 imported from branch stable_ids generation of Actionset datacite 2021-06-10 14:59:45 +02:00
Sandro La Bruzzo 42ff7a5665 some fix to the pom to compile scala 2021-06-10 14:31:06 +02:00
Sandro La Bruzzo ebe6aa6d38 implemented datacite transformation also on master 2021-06-10 10:52:36 +02:00
Miriam Baglioni 0eda93b3eb - 2021-06-09 13:25:35 +02:00
Miriam Baglioni 72771a1254 - 2021-06-09 13:25:19 +02:00
Miriam Baglioni 6cdc4d3bf3 - 2021-05-31 11:07:24 +02:00
Miriam Baglioni a106353cee - 2021-05-31 11:00:30 +02:00
Miriam Baglioni 5d8257b288 added code for ircdl_extention 2021-05-31 10:59:58 +02:00
Claudio Atzori a4cfabdbc6 Merge pull request 'master' (#111) from antonis.lempesis/dnet-hadoop:master into master
Reviewed-on: D-Net/dnet-hadoop#111
2021-05-28 14:09:12 +02:00
Claudio Atzori 338327171d integrating pull #109, H2020Classification 2021-05-27 11:57:01 +02:00
Claudio Atzori 6cbda49112 more pervasive use of constants from ModelConstants, especially for ORCID 2021-05-26 18:13:04 +02:00
Claudio Atzori ea9b00ce56 adjusted test 2021-05-20 15:31:42 +02:00
Claudio Atzori 2e70aa43f0 Merge pull request 'H2020Classification fix and possibility to add datasources in blacklist for propagation of result to organization' (#108) from miriam.baglioni/dnet-hadoop:master into master
Reviewed-on: D-Net/dnet-hadoop#108

The changes look ok, but please drop a comment to describe how the parameters should be changed from the workflow caller for both workflows
* H2020Classification
* propagation of result to organization
2021-05-20 15:25:05 +02:00
Antonis Lempesis 168edcbde3 added the final steps for the observatory promote wf and some cleanup 2021-05-18 15:23:20 +03:00
Antonis Lempesis 625d993cd9 added step for observatory db 2021-04-20 02:31:06 +03:00
Antonis Lempesis 25d0512fbd code cleanup 2021-04-20 01:43:23 +03:00
Miriam Baglioni 9d617a0a58 adding test and resouces 2021-03-30 10:26:51 +02:00
Miriam Baglioni d69c19e3fe adding test and resouces 2021-03-30 10:26:03 +02:00
Miriam Baglioni efd34c63ae adding sources 2021-03-30 10:22:11 +02:00
Miriam Baglioni 3214101a75 - 2021-02-24 11:28:31 +01:00
Miriam Baglioni fe7a7f2415 - 2021-02-24 10:19:03 +01:00
Miriam Baglioni a9fbd5b22d testing for context propagation 2021-02-23 10:44:59 +01:00
Miriam Baglioni 4c12e9664e added dependency for scholexplorer 2021-02-08 10:37:10 +01:00
Miriam Baglioni fb9e4a2769 testing for context propagation 2021-02-08 10:32:46 +01:00
Miriam Baglioni 7572069f98 context propagation 2021-02-08 10:32:13 +01:00
108 changed files with 14842 additions and 493 deletions

View File

@@ -115,6 +115,8 @@ public class AuthorMerger {
}
public static String pidToComparableString(StructuredProperty pid) {
if (pid == null)
return "";
return (pid.getQualifier() != null
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
: "")

View File

@@ -7,6 +7,37 @@
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
@@ -24,12 +55,6 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>com.sun.xml.bind</groupId>
<artifactId>jaxb-core</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
@@ -37,6 +62,13 @@
<artifactId>dhp-schemas</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-mapper</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
@@ -76,7 +108,10 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
</dependency>
</dependencies>
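The commons-text addition above (see the commit "added new dependency ag common-text") provides the CosineDistance measure used by the ircdl_extention Utils class further down in this diff. A small hedged sketch of what that measure returns, with made-up name strings and assuming the Apache Commons Text 1.x API:

import org.apache.commons.text.similarity.CosineDistance

object CosineSketch {
  def main(args: Array[String]): Unit = {
    val distance = new CosineDistance()
    // Token-based cosine distance: identical word multisets give 0.0 regardless of order,
    // which is why Utils.checkContains treats Math.round((1 - d) * 100) == 100 as an
    // "exact match word by word".
    println(distance.apply("john maynard smith", "smith john maynard")) // 0.0
    println(distance.apply("john smith", "john smyth"))                 // 0.5 (one shared token out of two)
  }
}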

View File

@@ -0,0 +1,544 @@
package eu.dnetlib.dhp.actionmanager.datacite
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import java.nio.charset.CodingErrorAction
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import java.util.regex.Pattern
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
object DataciteToOAFTransformation {
val UNKNOWN_REPOSITORY_ORIGINALID = "openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18"
val DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254"
val DNET_DATACITE_DATE = "dnet:dataCite_date"
val DNET_DATACITE_TITLE = "dnet:dataCite_title"
val SYSIMPORT_ACTIONSET = "sysimport:actionset"
val DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"
val PROVENANCE_ACTION_SET_QUALIFIER: Qualifier = OafMapperUtils.qualifier(SYSIMPORT_ACTIONSET, SYSIMPORT_ACTIONSET, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS)
val MAIN_TITLE_QUALIFIER:Qualifier = OafMapperUtils.qualifier("main title","main title",DNET_DATACITE_TITLE,DNET_DATACITE_TITLE)
implicit val codec: Codec = Codec("UTF-8")
codec.onMalformedInput(CodingErrorAction.REPLACE)
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val j_filter: List[String] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
s.lines.toList
}
val mapper = new ObjectMapper()
val unknown_repository: HostedByMapType = HostedByMapType(UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
val dataInfo: DataInfo = generateDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(DATACITE_ID, "Datacite")
val hostedByMap: Map[String, HostedByMapType] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(s)
json.extract[Map[String, HostedByMapType]]
}
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
def filter_json(json: String): Boolean = {
j_filter.exists(f => json.contains(f))
}
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
item match {
case dataset: OafDataset =>
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
a.setClazz(classOf[OafDataset])
a.setPayload(dataset)
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
case publication: Publication =>
val a: AtomicAction[Publication] = new AtomicAction[Publication]
a.setClazz(classOf[Publication])
a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case software: Software =>
val a: AtomicAction[Software] = new AtomicAction[Software]
a.setClazz(classOf[Software])
a.setPayload(software)
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
case orp: OtherResearchProduct =>
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
a.setClazz(classOf[OtherResearchProduct])
a.setPayload(orp)
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation])
a.setPayload(relation)
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
case _ =>
null
}
}
def embargo_end(embargo_end_date: String): Boolean = {
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
val td = LocalDate.now()
td.isAfter(dt)
}
def extract_date(input: String): Option[String] = {
val d = Date_regex.map(pattern => {
val matcher = pattern.matcher(input)
if (matcher.find())
matcher.group(0)
else
null
}
).find(s => s != null)
if (d.isDefined) {
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
try {
return Some(LocalDate.parse(a_date, df_en).toString)
} catch {
case _: Throwable => try {
return Some(LocalDate.parse(a_date, df_it).toString)
} catch {
case _: Throwable =>
return None
}
}
}
d
}
def fix_thai_date(input:String, format:String) :String = {
try {
val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format))
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
LocalDate.from(d).toString
} catch {
case _: Throwable => ""
}
}
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
if (schemaOrg != null && schemaOrg.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
null
}
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (typeQualifiers == null)
return null
val i = new Instance
i.setInstancetype(typeQualifiers._1)
typeQualifiers._2.getClassname match {
case "dataset" =>
val r = new OafDataset
r.setInstance(List(i).asJava)
return r
case "publication" =>
val r = new Publication
r.setInstance(List(i).asJava)
return r
case "software" =>
val r = new Software
r.setInstance(List(i).asJava)
return r
case "other" =>
val r = new OtherResearchProduct
r.setInstance(List(i).asJava)
return r
}
null
}
def available_date(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val l: List[String] = for {
JObject(dates) <- json \\ "dates"
JField("dateType", JString(dateTypes)) <- dates
} yield dateTypes
l.exists(p => p.equalsIgnoreCase("available"))
}
def OPEN_ACCESS_RIGHT = {
val result = new Qualifier
result.setClassid("OPEN")
result.setClassid("OPEN")
result.setSchemeid(ModelConstants.DNET_ACCESS_MODES)
result.setSchemename(ModelConstants.DNET_ACCESS_MODES)
result
}
/**
* As described in ticket #6377,
* when the result comes from Figshare we need to remove the subjects
* and set the access rights to OPEN.
* @param r the result to adjust in place
*/
def fix_figshare(r: Result): Unit = {
if (r.getInstance() != null) {
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(OPEN_ACCESS_RIGHT))
val l: List[StructuredProperty] = List()
r.setSubject(l.asJava)
}
}
}
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
OafMapperUtils.structuredProperty(dt, q, null)
}
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
val r = new Relation
r.setSource(sourceId)
r.setTarget(targetId)
r.setRelType(ModelConstants.RESULT_PROJECT)
r.setRelClass(relClass)
r.setSubRelType(ModelConstants.OUTCOME)
r.setCollectedfrom(List(cf).asJava)
r.setDataInfo(di)
r
}
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())
if (match_pattern.isDefined) {
val m = match_pattern.get._1
val p = match_pattern.get._2
val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}"
List(
generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo),
generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
)
}
else
List()
}
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup): List[Oaf] = {
if (filter_json(input))
return List()
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
val doi = (json \ "attributes" \ "doi").extract[String]
if (doi.isEmpty)
return List()
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (result == null)
return List()
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
result.setPid(List(pid).asJava)
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
result.setOriginalId(List(doi).asJava)
val d = new Date(dateOfCollection * 1000)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
result.setDateofcollection(ISO8601FORMAT.format(d))
result.setDateoftransformation(ISO8601FORMAT.format(ts))
result.setDataInfo(dataInfo)
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
val authors = creators.zipWithIndex.map { case (c, idx) =>
val a = new Author
a.setFullname(c.name.orNull)
a.setName(c.givenName.orNull)
a.setSurname(c.familyName.orNull)
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
a.setPid(c.nameIdentifiers.get.map(ni => {
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
}
else
null
}
)
.asJava)
}
if (c.affiliation.isDefined)
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
a.setRank(idx + 1)
a
}
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils.structuredProperty(t.title.get, MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, DNET_DATACITE_TITLE, DNET_DATACITE_TITLE, null)
}
}).asJava)
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
return List()
result.setAuthor(authors.asJava)
val dates = (json \\ "dates").extract[List[DateType]]
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
val i_date = dates
.filter(d => d.date.isDefined && d.dateType.isDefined)
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
.map(d => extract_date(d.date.get))
val a_date: Option[String] = dates
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
.map(d => extract_date(d.date.get))
.find(d => d != null && d.isDefined)
.map(d => d.get)
if (a_date.isDefined) {
if(doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null))
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
}
else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
}
}
else if (publication_year != null) {
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
}
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d => (d._1.get, vocabularies.getTermAsQualifier(DNET_DATACITE_DATE, d._2.toLowerCase())))
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2)).asJava)
val subjects = (json \\ "subjects").extract[List[SubjectType]]
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
.map(s =>
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava)
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
result.setDescription(
descriptions
.filter(d => d.description.isDefined).
map(d =>
OafMapperUtils.field(d.description.get, null)
).filter(s => s != null).asJava)
val publisher = (json \\ "publisher").extractOrElse[String](null)
if (publisher != null)
result.setPublisher(OafMapperUtils.field(publisher, null))
val language: String = (json \\ "language").extractOrElse[String](null)
if (language != null)
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
val instance = result.getInstance().get(0)
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
val accessRights: List[String] = for {
JObject(rightsList) <- json \\ "rightsList"
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
val aRights: Option[Qualifier] = accessRights.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
}).find(q => q != null).map(q => {
val a = new Qualifier
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
a.setSchemename(q.getSchemename)
a
})
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.qualifier(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
if (client.isDefined) {
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier)
val license = accessRights
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
if (license.isDefined)
instance.setLicense(OafMapperUtils.field(license.get, null))
}
val awardUris: List[String] = for {
JObject(fundingReferences) <- json \\ "fundingReferences"
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri
val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
}
else
List(result)
}
def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(PROVENANCE_ACTION_SET_QUALIFIER)
di
}
def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::")
val a = StringUtils.substringAfter(input, "::")
s"10|$b::${DHPUtils.md5(a)}"
}
}
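To make the date handling and the Figshare rule above easier to follow, here is a hedged sketch of the expected behaviour of extract_date, fix_thai_date and fix_figshare, derived from the patterns, formatters and setters defined in this file. It assumes the object and its resource files (datacite_filter, hostedBy_map.json) are on the classpath; the input values and the datasource id are made up.

import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation
import eu.dnetlib.dhp.schema.oaf.{Instance, OafMapperUtils, Publication}
import scala.collection.JavaConverters._

object DataciteHelpersSketch {
  def main(args: Array[String]): Unit = {
    // The Y-M-D regex matches first and df_en normalises the value to ISO yyyy-MM-dd.
    println(DataciteToOAFTransformation.extract_date("Published on 2007-05-21")) // Some(2007-05-21)
    // A bare year only matches the year-only pattern and is expanded to January 1st.
    println(DataciteToOAFTransformation.extract_date("2016")) // Some(2016-01-01)
    // Thai Buddhist calendar years (used for DOIs starting with 10.14457) are 543 years ahead of ISO.
    println(DataciteToOAFTransformation.fix_thai_date("2562-01-01", "[yyyy-MM-dd]")) // 2019-01-01

    // fix_figshare (ticket #6377): a result hosted by figshare loses its subjects and all of its
    // instances are forced to OPEN access. The datasource id below is hypothetical.
    val instance = new Instance
    instance.setHostedby(OafMapperUtils.keyValue("10|fake________::figshare", "figshare"))
    val result = new Publication
    result.setInstance(List(instance).asJava)
    DataciteToOAFTransformation.fix_figshare(result)
    println(instance.getAccessright.getClassid) // OPEN
  }
}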

View File

@@ -0,0 +1,40 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object ExportActionSetJobNode {
val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(ExportActionSetJobNode.getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING)
spark.read.load(sourcePath).as[Oaf]
.map(o =>DataciteToOAFTransformation.toActionSet(o))
.filter(o => o!= null)
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
}
}

View File

@@ -0,0 +1,43 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
import spark.implicits._
spark.read.load(sourcePath).as[DataciteType]
.filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies))
.filter(d => d != null)
.write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@@ -18,7 +18,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
@@ -33,7 +32,6 @@ public class PrepareProjects {
private static final Logger log = LoggerFactory.getLogger(PrepareProjects.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils

View File

@@ -0,0 +1,66 @@
package eu.dnetlib.dhp.ircdl_extention;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
public class PrepareCrossrefSpark {
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareCrossrefSpark.class
.getResourceAsStream(
"/eu/dnetlib/dhp/ircdl_extention/prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String inputPath = parser.get("inputPath");
final String outputPath = parser.get("outputPath");
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
selectResult(spark, inputPath, outputPath);
});
}
private static Dataset<Result> selectResult(SparkSession spark, String input_path, String output_path) {
Dataset<Result> res = Utils
.readPath(
spark, input_path, Result.class)
.filter(
(FilterFunction<Result>) r -> !r.getId().startsWith("50|dedup") &&
r.getCf().stream().anyMatch(cf -> cf.getValue().equals("Crossref")));
res.write().option("compression", "gzip").mode(SaveMode.Overwrite).json(output_path);
return res;
}
}

View File

@@ -0,0 +1,81 @@
package eu.dnetlib.dhp.ircdl_extention;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
public class PrepareDataciteSpark {
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareDataciteSpark.class
.getResourceAsStream(
"/eu/dnetlib/dhp/ircdl_extention/prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String inputPath = parser.get("inputPath");
final String outputPath = parser.get("outputPath");
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
exec(spark, outputPath, inputPath);
});
}
private static void exec(SparkSession spark, String output_path, String input_path) {
Dataset<Result> datacite = Utils
.readPath(
spark, input_path, Result.class)
.filter(
(FilterFunction<Result>) r -> r.getId().startsWith("50|datacite"));
datacite.write().option("compression", "gzip").mode(SaveMode.Overwrite).json(output_path + "allDatacite");
getProviderResult(output_path, datacite, "Zenodo");
getProviderResult(output_path, datacite, "Figshare");
getProviderResult(output_path, datacite, "Dryad");
}
private static void getProviderResult(String output_path, Dataset<Result> datacite, String provider) {
datacite
.filter(
(FilterFunction<Result>) r -> r
.getPid()
.stream()
.anyMatch(p -> p.getKey().equals("doi") && p.getValue().contains(provider.toLowerCase())))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(output_path + provider);
}
}

View File

@@ -0,0 +1,75 @@
package eu.dnetlib.dhp.ircdl_extention;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;
public class PrepareNormalizedOrcid {
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareNormalizedOrcid.class
.getResourceAsStream(
"/eu/dnetlib/dhp/ircdl_extention/prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String inputPath = parser.get("inputPath");
final String outputPath = parser.get("outputPath");
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
execNormalize(spark, outputPath, inputPath);
});
}
private static void execNormalize(SparkSession spark, String outputPath, String inputPath) {
Dataset<Orcid> orcid = Utils.readPath(spark, inputPath, Orcid.class);
orcid.map((MapFunction<Orcid, Orcid>) o -> {
o.setName(Utils.normalizeString(o.getName()));
o.setSurname(Utils.normalizeString(o.getSurname()));
o.setCreditname(Utils.normalizeString(o.getCreditname()));
o
.setOtherNames(
o
.getOtherNames()
.stream()
.map(on -> Utils.normalizeString(on))
.collect(Collectors.toList()));
return o;
}, Encoders.bean(Orcid.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
}
}
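The normalization applied above delegates to Utils.normalizeString (shown later in this diff), which strips diacritics and every non-alphabetic character before collapsing whitespace. A small hedged sketch of the expected output, with made-up input values:

import eu.dnetlib.dhp.ircdl_extention.Utils

object NormalizeSketch {
  def main(args: Array[String]): Unit = {
    // NFKD decomposition, removal of non-ASCII and non-alphabetic characters, whitespace collapsing.
    println(Utils.normalizeString("José-María  Hernández")) // "Jose Maria Hernandez"
    // The literal string "void" (and null) is mapped to the empty string.
    println(Utils.normalizeString("void"))                  // ""
  }
}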

View File

@@ -0,0 +1,89 @@
package eu.dnetlib.dhp.ircdl_extention;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Author;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
public class PrepareNormalizedResultSpark {
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareNormalizedResultSpark.class
.getResourceAsStream(
"/eu/dnetlib/dhp/ircdl_extention/prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String inputPath = parser.get("inputPath");
final String outputPath = parser.get("outputPath");
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
execNormalize(spark, outputPath, inputPath);
});
}
private static void execNormalize(SparkSession spark, String outputPath, String inputPath) {
Dataset<Author> normalized_result = Utils
.readPath(spark, inputPath + "publicationsWithOrcid", Author.class)
.union(Utils.readPath(spark, inputPath + "datasetWithOrcid", Author.class))
.union(Utils.readPath(spark, inputPath + "softwareWithOrcid", Author.class))
.union(Utils.readPath(spark, inputPath + "otherWithOrcid", Author.class))
.map((MapFunction<Author, Author>) r -> {
r.setName(Utils.normalizeString(r.getName()));
r.setSurname(Utils.normalizeString(r.getSurname()));
r.setFullname(Utils.normalizeString(r.getFullname()));
return r;
}, Encoders.bean(Author.class));
normalized_result
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath + "ResultAuthorNormalized");
normalized_result
.filter((FilterFunction<Author>) r -> !r.getId().startsWith("50|dedup"))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath + "collectedResultWithOrcid");
normalized_result
.filter((FilterFunction<Author>) r -> !r.getDeletedbyinference())
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath + "notDeletedByInferenceResultWithOrcid");
}
}

View File

@@ -0,0 +1,103 @@
package eu.dnetlib.dhp.ircdl_extention;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import scala.Tuple2;
public class PrepareResultAllTheRestSpark {
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareResultAllTheRestSpark.class
.getResourceAsStream(
"/eu/dnetlib/dhp/ircdl_extention/prepare_alltherest_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String inputPath = parser.get("inputPath");
final String outputPath = parser.get("outputPath");
final String instRepoPath = parser.get("instRepoPath");
final String crossrefPath = parser.get("crossrefPath");
final String datacitePath = parser.get("datacitePath");
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath + "allTheRest");
exec(
spark, outputPath + "allTheRest",
inputPath, instRepoPath,
datacitePath, crossrefPath);
});
}
/**
* Reads all the results from Crossref, Datacite and those associated with institutional repositories.
* Reads all the collected results.
* Performs a left join between the collected results and those read in the previous step,
* keeping only the ones that have no match in the join.
* @param spark
* @param output_path
* @param result_path
*/
private static void exec(SparkSession spark, String output_path, String result_path, String inst_repo_path,
String datacite_path, String crossref_path) {
Dataset<Result> result = Utils.readPath(spark, result_path, Result.class);
Dataset<Result> inst_repo = Utils
.readPath(spark, inst_repo_path, Result.class);
Dataset<Result> datacite = Utils
.readPath(spark, datacite_path, Result.class);
Dataset<Result> crossref = Utils
.readPath(spark, crossref_path, Result.class);
Dataset<Result> union_dataset = inst_repo.union(datacite).union(crossref);
result
.joinWith(union_dataset, result.col("id").equalTo(union_dataset.col("id")), "left")
.map((MapFunction<Tuple2<Result, Result>, Result>) t2 -> {
if (!Optional.ofNullable(t2._2()).isPresent())
return t2._1();
return null;
}, Encoders.bean(Result.class))
.filter(Objects::nonNull)
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(output_path);
}
}
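The joinWith/map/filter chain above is a hand-rolled left anti join: it keeps the collected results whose id has no match among the institutional-repository, Datacite and Crossref results. A hedged alternative sketch of the same selection using Spark's built-in left_anti join type (the object and method names below are illustrative, not part of this pull request):

import eu.dnetlib.dhp.ircdl_extention.model.Result
import org.apache.spark.sql.{Dataset, SaveMode}

object AllTheRestSketch {
  // Keeps the rows of `result` whose id does not appear in `unionDataset`.
  // The anti join returns an untyped DataFrame, which is fine here because
  // the rows are written straight to JSON with the same columns.
  def allTheRest(result: Dataset[Result], unionDataset: Dataset[Result], outputPath: String): Unit =
    result
      .join(unionDataset, Seq("id"), "left_anti")
      .write
      .option("compression", "gzip")
      .mode(SaveMode.Overwrite)
      .json(outputPath)
}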

View File

@@ -0,0 +1,92 @@
package eu.dnetlib.dhp.ircdl_extention;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import eu.dnetlib.dhp.schema.oaf.Datasource;
public class PrepareResultFromInstRepo {
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareResultFromInstRepo.class
.getResourceAsStream(
"/eu/dnetlib/dhp/ircdl_extention/prepare_instrepo_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String inputPath = parser.get("inputPath");
final String outputPath = parser.get("outputPath");
final String datasourcePath = parser.get("datasourcePath");
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
selectResultFromInstRepo(spark, inputPath, outputPath, datasourcePath);
});
}
private static void selectResultFromInstRepo(SparkSession spark, String inputPath, String output_path,
String datasourcePath) {
Dataset<Datasource> datasource = Utils.readPath(spark, datasourcePath, Datasource.class);
Dataset<Result> res = Utils
.readPath(
spark, inputPath, Result.class)
.filter(
(FilterFunction<Result>) r -> !r.getId().startsWith("50|doiboost")
&& !r.getId().startsWith("50|scholix")
&& !r.getId().startsWith("50|datacite")
&& !r.getId().startsWith("50|dedup"));
datasource.createOrReplaceTempView("datasource");
res.createOrReplaceTempView("result");
spark
.sql(
"SELECT t.id, t.deletedbyinference, t.name, t.surname, t.cf, t.fullname, t.pid, t.oid " +
"FROM " +
"(Select * " +
"from result " +
"LATERAL VIEW explode(cf.key) c as cfromkey) as t " +
"join " +
"datasource d " +
"on " +
"d.id = t.cfromkey " +
"and d.datasourcetype.classid = 'pubsrepository::institutional'")
.as(Encoders.bean(Result.class))
.write()
.option("compressio", "gzip")
.mode(SaveMode.Overwrite)
.json(output_path);
}
}

View File

@@ -0,0 +1,126 @@
package eu.dnetlib.dhp.ircdl_extention;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Author;
import eu.dnetlib.dhp.ircdl_extention.model.KeyValue;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class PrepareResultSpark {
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareResultSpark.class
.getResourceAsStream(
"/eu/dnetlib/dhp/ircdl_extention/prepare_result_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String resultClassName = parser.get("resultClass");
Class<? extends eu.dnetlib.dhp.schema.oaf.Result> resultClazz = (Class<? extends eu.dnetlib.dhp.schema.oaf.Result>) Class
.forName(resultClassName);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String inputPath = parser.get("inputPath");
final String outputPath = parser.get("outputPath");
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
mapToResult(spark, inputPath, resultClazz, outputPath);
});
}
private static <R extends eu.dnetlib.dhp.schema.oaf.Result> void mapToResult(SparkSession spark,
String input_path,
Class<R> resultClazz, String output_path) {
Dataset<R> publicationDataset = Utils.readPath(spark, input_path, resultClazz);
Dataset<R> result = publicationDataset.filter((FilterFunction<R>) p -> {
if (p.getAuthor() == null)
return false;
if (p.getAuthor().size() == 0)
return false;
return true;
});
result.flatMap((FlatMapFunction<R, Author>) p -> {
List<Author> reslist = new ArrayList<>();
p.getAuthor().forEach(a -> {
Author r = new Author();
r.setDeletedbyinference(p.getDataInfo().getDeletedbyinference());
r.setId(p.getId());
r
.setCf(
p
.getCollectedfrom()
.stream()
.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
.collect(Collectors.toList()));
r.setName(a.getName());
r.setSurname(a.getSurname());
r.setFullname(a.getFullname());
r
.setPid(
p
.getPid()
.stream()
.map(
pid -> KeyValue
.newInstance(pid.getQualifier().getClassid(), pid.getValue()))
.collect(Collectors.toList()));
r
.setApid(
Optional
.ofNullable(a.getPid())
.map(
pids -> pids
.stream()
.map(pd -> KeyValue.newInstance(pd.getQualifier().getClassid(), pd.getValue()))
.collect(Collectors.toList()))
.orElse(new ArrayList<>()));
reslist.add(r);
});
return reslist.iterator();
}, Encoders.bean(Author.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(output_path);
}
}

View File

@@ -0,0 +1,78 @@
package eu.dnetlib.dhp.ircdl_extention;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Author;
import eu.dnetlib.dhp.ircdl_extention.model.KeyValue;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class SelectAuthorWithOrcidOnlySpark {
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareResultSpark.class
.getResourceAsStream(
"/eu/dnetlib/dhp/ircdl_extention/prepare_result_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String inputPath = parser.get("inputPath");
final String outputPath = parser.get("outputPath");
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
selectAuthors(spark, inputPath, outputPath);
});
}
private static void selectAuthors(SparkSession spark, String input_path, String output_path) {
Dataset<Author> resultDataset = Utils.readPath(spark, input_path, Author.class);
resultDataset.flatMap((FlatMapFunction<Author, Result>) p -> {
List<Result> reslist = new ArrayList<>();
p.getApid().forEach(a -> {
if (a.getKey().equals(ModelConstants.ORCID_PENDING) || a.getKey().equals(ModelConstants.ORCID)) {
Result r = Result.fromAuthor(p);
r.setOid(a.getValue());
reslist.add(r);
}
});
return reslist.iterator();
}, Encoders.bean(Result.class))
.write()
.option("compressio", "gzip")
.mode(SaveMode.Overwrite)
.json(output_path);
}
}

View File

@@ -0,0 +1,268 @@
package eu.dnetlib.dhp.ircdl_extention;
import java.io.Serializable;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.CosineDistance;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.wcohen.ss.JaroWinkler;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import scala.Tuple2;
public class Utils implements Serializable {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final Logger log = LoggerFactory
.getLogger(eu.dnetlib.dhp.ircdl_extention.Utils.class);
public static String normalizeString(String input) {
if (input == null || input.equals("void"))
return new String();
String tmp = Normalizer
.normalize(input, Normalizer.Form.NFKD)
.replaceAll("[^\\p{ASCII}]", "");
tmp = tmp
.replaceAll("[^\\p{Alpha}]+", " ")
.replaceAll("\\s+", " ")
.trim();
return tmp;
}
public static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
private static List<String> getList(List<String> input) {
return input
.stream()
.map(st -> st.trim())
.filter(st -> st.length() > 0)
.sorted()
.collect(Collectors.toList());
}
private static List<String> getListInitials(List<String> input) {
List<String> ret = new ArrayList<>();
List<Character> tmp = input
.stream()
.map(st -> st.trim())
.filter(st -> st.length() > 0)
.map(st -> st.charAt(0))
.sorted()
.collect(Collectors.toList());
if (tmp.size() == 1)
ret.add(String.valueOf(tmp.get(0)));
for (int i = 0; i < tmp.size(); i++) {
for (int j = i + 1; j < tmp.size(); j++) {
ret.add(String.valueOf(tmp.get(i)) + String.valueOf(tmp.get(j)));
}
}
return ret;
}
// selects the pairs of initial characters of each word composing the name
// if there is a match the name is considered correct
// TODO: add a check that the two list lengths are not too unbalanced: if one list is long
public static boolean conservativeFilterFunction(Tuple2<Result, Orcid> input) {
List<String> res = getListInitials(Arrays.asList(input._1().getFullname().split(" ")));
Orcid or = input._2();
List<String> tmp = new ArrayList<>();
Collections.addAll(tmp, or.getName().split(" "));
Collections.addAll(tmp, or.getSurname().split(" "));
return checkContains(
res, getListInitials(tmp), false)
||
checkContains(
res, getListInitials(Arrays.asList(or.getCreditname().split(" "))), false)
||
or
.getOtherNames()
.stream()
.anyMatch(
on -> checkContains(
res, getListInitials(Arrays.asList(on.split(" "))), false));
}
public static boolean filterFunction(Tuple2<Result, Orcid> input) throws JsonProcessingException {
try {
List<String> res = getList(Arrays.asList(input._1().getFullname().split(" ")));
Orcid or = input._2();
List<String> tmp = new ArrayList<>();
Collections.addAll(tmp, or.getName().split(" "));
Collections.addAll(tmp, or.getSurname().split(" "));
return checkContains(
res, getList(tmp)
.stream()
.sorted()
.collect(Collectors.toList()))
||
checkContains(
res, getList(Arrays.asList(or.getCreditname().split(" ")))
.stream()
.sorted()
.collect(Collectors.toList()))
||
or
.getOtherNames()
.stream()
.anyMatch(
on -> checkContains(
res, getList(Arrays.asList(on.split(" ")))
.stream()
.sorted()
.collect(Collectors.toList())));
} catch (Exception e) {
log.info("EXCEPTIONNNN: " + new ObjectMapper().writeValueAsString(input));
throw e;
}
}
private static boolean checkContains(List<String> result, List<String> orcid) {
return checkContains(result, orcid, true);
}
private static boolean checkContains(List<String> result, List<String> orcid, boolean jaro) {
if (result.size() == 0 || orcid.size() == 0) {
return true;
}
String[][] input = {
{
"1", StringUtils.joinWith(" ", result)
},
{
"2", StringUtils.joinWith(" ", orcid)
}
};
// exact match word by word
Double cosineDistance = new CosineDistance().apply(input[0][1], input[1][1]);
if (Math.round((1 - cosineDistance) * 100) == 100) {
return true;
}
// check containment one list can be greater than the other, and also composition of words to create the name
// e.g. pengli yan = li peng yan
if (orcid.size() < result.size()) {
if (isIn(orcid, result))
return true;
} else {
if (isIn(result, orcid))
return true;
}
if (jaro) {
// apply JaroWinkler distance
double score = new JaroWinkler()
.score(StringUtils.joinWith(" ", result), StringUtils.joinWith(" ", orcid));
return score > 0.95;
}
return false;
}
private static boolean isIn(List<String> lst1, List<String> lst2) {
int index = 0;
for (String word : lst1) {
int i = index;
boolean found = false;
while (i < lst2.size()) {
String wordlist = lst2.get(i);
if (word.equals(wordlist)) {
index = i + 1;
i = lst2.size();
found = true;
} else {
if (word.charAt(0) < wordlist.charAt(0)) {
if (!checkComposition(word, lst2)) {
return false;
} else {
index = 0;
i = lst2.size();
found = true;
}
} else {
if (word.length() == 1 || wordlist.length() == 1) {
if (word.charAt(0) == wordlist.charAt(0)) {
index = i + 1;
i = lst2.size();
found = true;
} else {
i++;
}
} else {
i++;
}
}
}
}
if (!found) {
if (!checkComposition(word, lst2)) {
return false;
} else {
index = 0;
}
}
}
return true;
}
private static boolean checkComposition(String word, List<String> lst2) {
for (int i = 0; i < lst2.size(); i++) {
for (int j = 0; j < lst2.size(); j++) {
if (i != j) {
String w = lst2.get(i) + lst2.get(j);
if (word.equals(w)) {
if (i > j) {
lst2.remove(i);
lst2.remove(j);
} else {
lst2.remove(j);
lst2.remove(i);
}
return true;
}
}
}
}
return false;
}
}
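The containment logic above (checkContains, isIn, checkComposition) ignores word order and allows adjacent name tokens to be glued together, so that, as the comment notes, "pengli yan" and "li peng yan" match. A standalone illustrative sketch of the composition idea only (it does not reuse the private methods of Utils):

object NameCompositionSketch {
  // A token of the shorter name matches if it appears verbatim in the longer name
  // or if it is the concatenation of two of its tokens, in either order.
  def matchesByComposition(shorter: List[String], longer: List[String]): Boolean =
    shorter.forall { w =>
      longer.contains(w) ||
        longer.combinations(2).exists { case List(x, y) => w == x + y || w == y + x }
    }

  def main(args: Array[String]): Unit = {
    val a = "pengli yan".split(" ").toList.sorted
    val b = "li peng yan".split(" ").toList.sorted
    println(matchesByComposition(a, b)) // true: "pengli" == "peng" + "li"
    println(matchesByComposition(List("john", "smith"), List("doe", "john"))) // false
  }
}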

View File

@@ -0,0 +1,174 @@
package eu.dnetlib.dhp.ircdl_extention;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapGroupsFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.core.JsonProcessingException;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.ircdl_extention.model.Author;
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import eu.dnetlib.dhp.ircdl_extention.model.ShuffleInfo;
import scala.Tuple2;
public class WrongSpark {
/**
* Takes as input the normalized ORCID records and the normalized entries to be checked against ORCID,
* and returns a lower bound of the wrongly attributed ORCID identifiers.
*/
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
WrongSpark.class
.getResourceAsStream(
"/eu/dnetlib/dhp/ircdl_extention/wrong_orcid_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String orcidPath = parser.get("orcidPath");
final String outputPath = parser.get("outputPath");
final String resultPath = parser.get("inputPath");
final String authorPath = parser.get("authorPath");
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", "thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083");
runWithSparkHiveSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
findWrong(spark, orcidPath, outputPath + "/wrong", resultPath);
findShuffle(spark, orcidPath, outputPath + "/shuffle", resultPath, authorPath);
});
}
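/**
* Looks for possible "shuffles": ORCID attributions that do not match the author they are attached to
* but do match another author of the same result. The graph authors are unioned with the (result, orcid)
* pairs flagged as wrong, grouped by result id, and an entry is kept when its ORCID name matches the
* full name of some other author in the same group.
*/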
private static void findShuffle(SparkSession spark, String orcidPath, String outputPath, String resultPath,
String authorPath) {
Utils
.readPath(spark, authorPath, Author.class)
.map(
(MapFunction<Author, ShuffleInfo>) r -> ShuffleInfo
.newInstance(r.getName(), r.getSurname(), r.getFullname(), r.getId()),
Encoders.bean(ShuffleInfo.class))
.union(
getWrong(spark, orcidPath, resultPath)
.map((MapFunction<Tuple2<Result, Orcid>, ShuffleInfo>) t2 ->
ShuffleInfo
.newInstance(
t2._1().getName(), t2._1().getSurname(), t2._1().getFullname(),
t2._1().getId(), t2._2().getName(), t2._2().getSurname(),
t2._2().getCreditname(), t2._2().getOtherNames(), t2._2().getOrcid()),
Encoders.bean(ShuffleInfo.class)))
.groupByKey((MapFunction<ShuffleInfo, String>) si -> si.getId(), Encoders.STRING())
.flatMapGroups((FlatMapGroupsFunction<String, ShuffleInfo, ShuffleInfo>) (s, it) -> {
List<ShuffleInfo> shuffleInfoList = new ArrayList<>();
List<ShuffleInfo> ret = new ArrayList<>();
shuffleInfoList.add(it.next());
it.forEachRemaining(e -> shuffleInfoList.add(e));
shuffleInfoList
.stream()
.filter(e -> Optional.ofNullable(e.getOrcid()).isPresent())
.forEach(e -> {
if (checkShuffle(e, shuffleInfoList))
ret.add(e);
});
return ret.iterator();
}, Encoders.bean(ShuffleInfo.class))
.filter(Objects::nonNull)
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
/*
 * Original Python prototype of the shuffle check, kept for reference:
 * def checkShuffle(x):
 *     alis = [a for a in x[1]]
 *     dic = {}
 *     count = 0
 *     for entry in alis:
 *         if entry['orcid'] != '': dic[entry['orcid']] = entry
 *     for orcid in dic:
 *         name = dic[orcid]['oname']
 *         surname = dic[orcid]['osurname']
 *         for author in alis:
 *             if author['aname'] == "" or author['asurname'] == "":
 *                 if checkContains([author['afullname']], addInListAll([], name + " " + surname)): count += 1; break
 *             else:
 *                 if checkContains([author['aname'] + " " + author['asurname']], addInListAll([], name + " " + surname)): count += 1; break
 *     return count
 *
 * candidate_shuffle = zenodo_normalized.map(lambda x: (x['id'], {'aname': x['name'], 'asurname': x['surname'],
 *     'afullname': x['fullname'], 'oname': "", 'osurname': "", 'orcid': ''})).union(
 *     join_orcid_filtered.map(lambda e: (e['id'], {'aname': e['nameg'], 'asurname': e['surnameg'],
 *     'afullname': e['fullnameg'], 'oname': e['name'], 'osurname': e['surname'], 'orcid': e['orcid']}))).groupByKey().filter(toBeChecked)
 */
}
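// true if the ORCID name attached to e matches the full name of at least one author of the same result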
private static boolean checkShuffle(ShuffleInfo e, List<ShuffleInfo> shuffleInfoList) {
boolean b = shuffleInfoList
.stream()
.anyMatch(
si -> {
try {
return Utils
.filterFunction(
new Tuple2<>(Result.newInstance(si.getAfullname()),
Orcid
.newInstance(
e.getOname(), e.getOsurname(), e.getOcreditName(),
e.getoOtherNames())));
} catch (JsonProcessingException ex) {
throw new RuntimeException(ex);
}
});
return b;
}
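/**
* Joins the normalized results with the ORCID records on the claimed identifier and keeps the pairs
* rejected by the conservative filter, i.e. the candidate wrong attributions. ORCID records whose
* name contains "deactivated" are discarded beforehand.
*/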
private static Dataset<Tuple2<Result, Orcid>> getWrong(SparkSession spark, String orcidPath, String resultPath) {
Dataset<Orcid> orcidDataset = Utils
.readPath(spark, orcidPath, Orcid.class)
.filter((FilterFunction<Orcid>) o -> !o.getName().contains("deactivated"));
Dataset<Result> resultDataset = Utils.readPath(spark, resultPath, Result.class);
return resultDataset
.joinWith(
orcidDataset, resultDataset
.col("oid")
.equalTo(orcidDataset.col("orcid")),
"inner")
.filter((FilterFunction<Tuple2<Result, Orcid>>) t2 -> !Utils.conservativeFilterFunction(t2));
}
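// materializes the candidate wrong attributions as gzip-compressed JSON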
private static void findWrong(SparkSession spark, String orcidPath, String outputPath, String resultPath) {
getWrong(spark, orcidPath, resultPath)
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
}
}

View File

@ -0,0 +1,18 @@
package eu.dnetlib.dhp.ircdl_extention.model;
import java.io.Serializable;
import java.util.List;
public class Author extends Result implements Serializable {
private List<KeyValue> apid;
public List<KeyValue> getApid() {
return apid;
}
public void setApid(List<KeyValue> apid) {
this.apid = apid;
}
}

View File

@ -0,0 +1,33 @@
package eu.dnetlib.dhp.ircdl_extention.model;
import java.io.Serializable;
public class KeyValue implements Serializable {
private String key;
private String value;
public static KeyValue newInstance(String key, String value) {
KeyValue kv = new KeyValue();
kv.key = key;
kv.value = value;
return kv;
}
public String getKey() {
return key;
}
public void setKey(String key) {
this.key = key;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

View File

@ -0,0 +1,89 @@
package eu.dnetlib.dhp.ircdl_extention.model;
import java.io.Serializable;
import java.util.List;
public class Orcid implements Serializable {
private List<String> otherNames;
private String inception;
private String surname;
private String mode;
private String creditname;
private String orcid;
private Boolean works;
private String name;
public static Orcid newInstance(String oname, String osurname, String ocreditName, List<String> oOtherNames) {
Orcid o = new Orcid();
o.name = oname;
o.surname = osurname;
o.creditname = ocreditName;
o.otherNames = oOtherNames;
return o;
}
public List<String> getOtherNames() {
return otherNames;
}
public void setOtherNames(List<String> otherNames) {
this.otherNames = otherNames;
}
public String getInception() {
return inception;
}
public void setInception(String inception) {
this.inception = inception;
}
public String getSurname() {
return surname;
}
public void setSurname(String surname) {
this.surname = surname;
}
public String getMode() {
return mode;
}
public void setMode(String mode) {
this.mode = mode;
}
public String getCreditname() {
return creditname;
}
public void setCreditname(String creditname) {
this.creditname = creditname;
}
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = orcid;
}
public Boolean getWorks() {
return works;
}
public void setWorks(Boolean works) {
this.works = works;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
}

View File

@ -0,0 +1,109 @@
package eu.dnetlib.dhp.ircdl_extention.model;
import java.io.Serializable;
import java.util.List;
public class Result implements Serializable {
private Boolean deletedbyinference;
private String id;
private List<KeyValue> cf;
private List<KeyValue> pid;
private String name;
private String surname;
private String fullname;
private String oid;
public static Result newInstance(String afullname) {
Result r = new Result();
r.fullname = afullname;
return r;
}
public static Result fromAuthor(Author p) {
Result r = new Result();
r.deletedbyinference = p.getDeletedbyinference();
r.id = p.getId();
r.cf = p.getCf();
r.pid = p.getPid();
r.name = p.getName();
r.surname = p.getSurname();
r.fullname = p.getFullname();
return r;
}
public Boolean getDeletedbyinference() {
return deletedbyinference;
}
public void setDeletedbyinference(Boolean deletedbyinference) {
this.deletedbyinference = deletedbyinference;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<KeyValue> getCf() {
return cf;
}
public void setCf(List<KeyValue> cf) {
this.cf = cf;
}
public List<KeyValue> getPid() {
return pid;
}
public void setPid(List<KeyValue> pid) {
this.pid = pid;
}
public String getName() {
return name;
}
public void setName(String name) {
if (name != null)
this.name = name.toLowerCase();
else
this.name = "";
}
public String getSurname() {
return surname;
}
public void setSurname(String surname) {
if (surname != null)
this.surname = surname.toLowerCase();
else
this.surname = "";
}
public String getFullname() {
return fullname;
}
public void setFullname(String fullname) {
if (fullname != null)
this.fullname = fullname.toLowerCase();
else
this.fullname = "";
}
public String getOid() {
return oid;
}
public void setOid(String oid) {
this.oid = oid;
}
}

View File

@ -0,0 +1,125 @@
package eu.dnetlib.dhp.ircdl_extention.model;
import java.io.Serializable;
import java.util.List;
public class ShuffleInfo implements Serializable {
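// fields prefixed with 'a' describe the author as found in the graph result,
// fields prefixed with 'o' describe the ORCID record associated to it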
private String aname;
private String asurname;
private String afullname;
private String oname;
private String osurname;
private String ocreditName;
private List<String> oOtherNames;
private String orcid;
private String id;
private String pid;
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public String getAname() {
return aname;
}
public void setAname(String aname) {
this.aname = aname;
}
public String getAsurname() {
return asurname;
}
public void setAsurname(String asurname) {
this.asurname = asurname;
}
public String getAfullname() {
return afullname;
}
public void setAfullname(String afullname) {
this.afullname = afullname;
}
public String getOname() {
return oname;
}
public void setOname(String oname) {
this.oname = oname;
}
public String getOsurname() {
return osurname;
}
public void setOsurname(String osurname) {
this.osurname = osurname;
}
public String getOcreditName() {
return ocreditName;
}
public void setOcreditName(String ocreditName) {
this.ocreditName = ocreditName;
}
public List<String> getoOtherNames() {
return oOtherNames;
}
public void setoOtherNames(List<String> oOtherNames) {
this.oOtherNames = oOtherNames;
}
public String getOrcid() {
return orcid;
}
public void setOrcid(String orcid) {
this.orcid = orcid;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public static ShuffleInfo newInstance(String aname, String asurname, String afullname, String id) {
ShuffleInfo si = new ShuffleInfo();
si.afullname = afullname;
si.aname = aname;
si.asurname = asurname;
si.id = id;
return si;
}
public static ShuffleInfo newInstance(String aname, String asurname, String afullname, String id, String oname,
String osurname, String ocredtname, List<String> oOthername, String orcid) {
// convenience overload used by callers that do not provide the result pid
return newInstance(aname, asurname, afullname, id, oname, osurname, ocredtname, oOthername, orcid, null);
}
public static ShuffleInfo newInstance(String aname, String asurname, String afullname, String id, String oname,
String osurname, String ocredtname, List<String> oOthername, String orcid, String pid) {
ShuffleInfo si = new ShuffleInfo();
si.afullname = afullname;
si.aname = aname;
si.asurname = asurname;
si.id = id;
si.oname = oname;
si.osurname = osurname;
si.ocreditName = ocredtname;
si.oOtherNames = oOthername;
si.orcid = orcid;
si.pid = pid;
return si;
}
}

View File

@ -0,0 +1,28 @@
TUBYDI - Assistir Filmes e Series Online Grátis
123Movies
WATCH FULL MOVIE
Movierulz
Full Movie Online
MOVIé WatcH
The King of Staten Island 2020 Online For Free
Watch Train to Busan 2 2020 online for free
Sixth Sense Movie Novelization
Film Complet streaming vf gratuit en ligne
watch now free
LIVE stream watch
LIVE stream UFC
RBC Heritage live stream
MLBStreams Free
NFL Live Stream
Live Stream Free
Royal Ascot 2020 Live Stream
TV Shows Full Episodes Official
FuboTV
Gomovies
Online Free Trial Access
123watch
DÜŞÜK HAPI
Bebek Düşürme Yöntemleri
WHATSAP İLETİŞİM
Cytotec
düşük hapı

View File

@ -0,0 +1,21 @@
[
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source mdstore path",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the target mdstore path",
"paramRequired": true
},
{
"paramName": "m",
"paramLongName": "master",
"paramDescription": "the master name",
"paramRequired": true
}
]

View File

@ -0,0 +1,26 @@
[
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source mdstore path",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the target mdstore path",
"paramRequired": true
},
{
"paramName": "m",
"paramLongName": "master",
"paramDescription": "the master name",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "isLookupUrl",
"paramDescription": "the isLookup URL",
"paramRequired": true
}
]

View File

@ -0,0 +1,23 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,46 @@
<workflow-app name="Import_Datacite_and_transform_to_OAF" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>mainPath</name>
<description>the working path of Datacite stores</description>
</property>
<property>
<name>isLookupUrl</name>
<description>The IS lookUp service endpoint</description>
</property>
</parameters>
<start to="TransformJob"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="TransformJob">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>TransformJob</name>
<class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
<arg>--targetPath</arg><arg>${mainPath}/production/datacite_oaf</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,23 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,46 @@
<workflow-app name="Datacite_to_ActionSet_Workflow" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the working path of Datacite stores</description>
</property>
<property>
<name>outputPath</name>
<description>the path of Datacite ActionSet</description>
</property>
</parameters>
<start to="ExportDataset"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ExportDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExportDataset</name>
<class>eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${outputPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,58 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>sparkExecutorNumber</name>
<value>4</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>sparkDriverMemory</name>
<value>15G</value>
</property>
<property>
<name>sparkExecutorMemory</name>
<value>6G</value>
</property>
<property>
<name>sparkExecutorCores</name>
<value>1</value>
</property>
</configuration>

View File

@ -0,0 +1,516 @@
<workflow-app name="IRCDL Extention" xmlns="uri:oozie:workflow:0.5">
<start to="deleteoutputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="deleteoutputpath">
<fs>
<delete path='${outputPath}'/>
<mkdir path='${outputPath}'/>
<delete path='${workingDir}'/>
<mkdir path='${workingDir}'/>
</fs>
<ok to="fork_prepare"/>
<error to="Kill"/>
</action>
<fork name="fork_prepare">
<path start="fork_prepare_result"/>
<path start="prepare_orcid"/>
</fork>
<action name="prepare_orcid">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareNormalizedOrcid</name>
<class>eu.dnetlib.dhp.ircdl_extention.PrepareNormalizedOrcid</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${orcidInputPath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
</spark>
<ok to="join_fork"/>
<error to="Kill"/>
</action>
<fork name="fork_prepare_result">
<path start="prepare_publication"/>
<path start="prepare_dataset"/>
<path start="prepare_software"/>
<path start="prepare_other"/>
</fork>
<action name="prepare_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareResult</name>
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/publication</arg>
<arg>--resultClass</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/publicationsWithOrcid</arg>
</spark>
<ok to="wait_prepare_result"/>
<error to="Kill"/>
</action>
<action name="prepare_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareResult</name>
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
<arg>--resultClass</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/datasetWithOrcid</arg>
</spark>
<ok to="wait_prepare_result"/>
<error to="Kill"/>
</action>
<action name="prepare_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareResult</name>
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/software</arg>
<arg>--resultClass</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/softwareWithOrcid</arg>
</spark>
<ok to="wait_prepare_result"/>
<error to="Kill"/>
</action>
<action name="prepare_other">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareResult</name>
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
<arg>--resultClass</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/otherWithOrcid</arg>
</spark>
<ok to="wait_prepare_result"/>
<error to="Kill"/>
</action>
<join name="wait_prepare_result" to="normalize_result"/>
<action name="normalize_result">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>NormalizeResult</name>
<class>eu.dnetlib.dhp.ircdl_extention.PrepareNormalizedResultSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/</arg>
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/Normalized/</arg>
</spark>
<ok to="select_only_author_with_orcid"/>
<error to="Kill"/>
</action>
<action name="select_only_author_with_orcid">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>SelectAuthorWithOrcidOnly</name>
<class>eu.dnetlib.dhp.ircdl_extention.SelectAuthorWithOrcidOnlySpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultWithOrcid/</arg>
</spark>
<ok to="fork_get_result_info"/>
<error to="Kill"/>
</action>
<fork name="fork_get_result_info">
<path start="get_result_instrepo"/>
<path start="get_result_datacite"/>
<path start="get_result_crossref"/>
</fork>
<action name="get_result_instrepo">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GetResultInstRepo</name>
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultFromInstRepo</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultWithOrcid/</arg>
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/InstRepo/</arg>
<arg>--datasourcePath</arg><arg>${datasourcePath}</arg>
</spark>
<ok to="wait_res_info"/>
<error to="Kill"/>
</action>
<action name="get_result_datacite">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GetResultDatacite</name>
<class>eu.dnetlib.dhp.ircdl_extention.PrepareDataciteSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultWithOrcid/</arg>
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/Datacite/</arg>
</spark>
<ok to="wait_res_info"/>
<error to="Kill"/>
</action>
<action name="get_result_crossref">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GetResultCrossref</name>
<class>eu.dnetlib.dhp.ircdl_extention.PrepareCrossrefSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultWithOrcid/</arg>
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/Crossref/</arg>
</spark>
<ok to="wait_res_info"/>
<error to="Kill"/>
</action>
<join name="wait_res_info" to="get_result_alltherest"/>
<action name="get_result_alltherest">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GetResultAllTheRest</name>
<class>eu.dnetlib.dhp.ircdl_extention.PrepareResultAllTheRestSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Normalized/ResultWithOrcid/</arg>
<arg>--outputPath</arg><arg>${workingDir}/GRAPH/AllTheRest/</arg>
<arg>--instRepoPath</arg><arg>${workingDir}/GRAPH/InstRepo/</arg>
<arg>--datacitePath</arg><arg>${workingDir}/GRAPH/Datacite/</arg>
<arg>--crossrefPath</arg><arg>${workingDir}/GRAPH/Crossref/</arg>
</spark>
<ok to="join_fork"/>
<error to="Kill"/>
</action>
<join name="join_fork" to="fork_get_wrong"/>
<fork name="fork_get_wrong">
<path start="get_wrong_instrepo"/>
<path start="get_wrong_datacite"/>
<path start="get_wrong_crossref"/>
<path start="get_wrong_alltherest"/>
<path start="get_wrong_zenodo"/>
<path start="get_wrong_figshare"/>
<path start="get_wrong_dryad"/>
</fork>
<action name="get_wrong_instrepo">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GetWrongInstRepo</name>
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/InstRepo/</arg>
<arg>--outputPath</arg><arg>${outputPath}/InstRepo/</arg>
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
</spark>
<ok to="jojn_wrong"/>
<error to="Kill"/>
</action>
<action name="get_wrong_datacite">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GetWrongDatacite</name>
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Datacite/allDatacite/</arg>
<arg>--outputPath</arg><arg>${outputPath}/Datacite/</arg>
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
</spark>
<ok to="jojn_wrong"/>
<error to="Kill"/>
</action>
<action name="get_wrong_crossref">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GetWrongCrossref</name>
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Crossref/</arg>
<arg>--outputPath</arg><arg>${outputPath}/Crossref/</arg>
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
</spark>
<ok to="jojn_wrong"/>
<error to="Kill"/>
</action>
<action name="get_wrong_alltherest">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GetWrongAllTheRest</name>
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/AllTheRest/</arg>
<arg>--outputPath</arg><arg>${outputPath}/AllTheRest/</arg>
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
</spark>
<ok to="jojn_wrong"/>
<error to="Kill"/>
</action>
<action name="get_wrong_zenodo">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GetWrongZenodo</name>
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Datacite/Zenodo/</arg>
<arg>--outputPath</arg><arg>${outputPath}/Zenodo/</arg>
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
</spark>
<ok to="jojn_wrong"/>
<error to="Kill"/>
</action>
<action name="get_wrong_figshare">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GetWrongFigshare</name>
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Datacite/Figshare/</arg>
<arg>--outputPath</arg><arg>${outputPath}/Figshare/</arg>
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
</spark>
<ok to="jojn_wrong"/>
<error to="Kill"/>
</action>
<action name="get_wrong_dryad">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GetWrongDryad</name>
<class>eu.dnetlib.dhp.ircdl_extention.WrongSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/GRAPH/Datacite/Dryad/</arg>
<arg>--outputPath</arg><arg>${outputPath}/Dryad/</arg>
<arg>--orcidPath</arg><arg>${workingDir}/ORCID/entrySetMayNormalized/</arg>
<arg>--ap</arg><arg>${workingDir}/GRAPH/Normalized/ResultAuthorNormalized/</arg>
</spark>
<ok to="jojn_wrong"/>
<error to="Kill"/>
</action>
<join name="jojn_wrong" to="End"/>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,35 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},{
"paramName": "ir",
"paramLongName": "instRepoPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},{
"paramName": "dp",
"paramLongName": "datacitePath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},{
"paramName": "cp",
"paramLongName": "crossrefPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

View File

@ -0,0 +1,26 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "dp",
"paramLongName": "datasourcePath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

View File

@ -0,0 +1,20 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

View File

@ -0,0 +1,26 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "rc",
"paramLongName": "resultClass",
"paramDescription": "the path of the new ActionSet",
"paramRequired": false
}
]

View File

@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "op",
"paramLongName": "orcidPath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "thepath of the new ActionSet",
"paramRequired": true
},
{
"paramName": "ap",
"paramLongName": "authorPath",
"paramDescription": "thepath of the new ActionSet",
"paramRequired": true
}
]

View File

@ -0,0 +1,50 @@
package eu.dentlib.dhp.aggregation;
import static org.mockito.Mockito.lenient;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import org.apache.commons.io.IOUtils;
import org.mockito.Mock;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public abstract class AbstractVocabularyTest {
@Mock
protected ISLookUpService isLookUpService;
protected VocabularyGroup vocabularies;
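// mocks the ISLookUp service so that vocabularies and synonyms are loaded from the test resources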
public void setUpVocabulary() throws ISLookUpException, IOException {
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
lenient()
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
.thenReturn(synonyms());
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
}
private static List<String> vocs() throws IOException {
return IOUtils
.readLines(
Objects
.requireNonNull(
AbstractVocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/vocabulary/terms.txt")));
}
private static List<String> synonyms() throws IOException {
return IOUtils
.readLines(
Objects
.requireNonNull(
AbstractVocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/vocabulary/synonyms.txt")));
}
}

View File

@ -0,0 +1,43 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dentlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import org.codehaus.jackson.map.ObjectMapper
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
class DataciteToOAFTest extends AbstractVocabularyTest{
@BeforeEach
def setUp() :Unit = {
super.setUpVocabulary()
}
@Test
def testMapping() :Unit = {
val record =Source.fromInputStream(getClass.getResourceAsStream("datacite.json")).mkString
val mapper = new ObjectMapper()
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies )
println (mapper.defaultPrettyPrintingWriter().writeValueAsString(res.head))
}
@Test
def testDate():Unit = {
println(DataciteToOAFTransformation.fix_thai_date("01-01-2561","[dd-MM-yyyy]"))
println(DataciteToOAFTransformation.fix_thai_date("2561-01-01","[yyyy-MM-dd]"))
}
}

View File

@ -0,0 +1,98 @@
package eu.dnetlib.dhp.ircdl_extention;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;
public class NormalizeOrcidTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ClassLoader cl = eu.dnetlib.dhp.ircdl_extention.NormalizeOrcidTest.class
.getClassLoader();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory
.getLogger(eu.dnetlib.dhp.ircdl_extention.NormalizeOrcidTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(eu.dnetlib.dhp.ircdl_extention.NormalizeOrcidTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(eu.dnetlib.dhp.ircdl_extention.NormalizeOrcidTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
// conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
// conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(NormalizeOrcidTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
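// runs PrepareNormalizedOrcid on the sample ORCID dump and prints the normalized records (no assertions yet)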
@Test
public void normalizeOrcid() throws Exception {
PrepareNormalizedOrcid
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
getClass()
.getResource(
"/eu/dnetlib/dhp/ircdl_extention/orcid_original.json")
.getPath(),
"-outputPath",
workingDir.toString() + "/orcidNormalized"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Orcid> tmp = sc
.textFile(workingDir.toString() + "/orcidNormalized")
.map(value -> OBJECT_MAPPER.readValue(value, Orcid.class));
tmp.foreach(v -> System.out.println(OBJECT_MAPPER.writeValueAsString(v)));
}
}

View File

@ -0,0 +1,280 @@
package eu.dnetlib.dhp.ircdl_extention;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.core.JsonProcessingException;
import eu.dnetlib.dhp.ircdl_extention.model.Orcid;
import eu.dnetlib.dhp.ircdl_extention.model.Result;
import eu.dnetlib.dhp.ircdl_extention.model.ShuffleInfo;
import scala.Tuple2;
public class WrongOrcidTest {
@Test
public void wrongOrcidFalse() throws Exception {
Assertions
.assertTrue(
Utils
.filterFunction(
new Tuple2<>(Result.newInstance("veigas pires cristina"),
Orcid
.newInstance(
"cristina", "veiga pires", "c veiga pires",
Arrays.asList("c c veiga pires")))));
}
@Test
public void wrongOrcidFalse2() throws Exception {
Assertions
.assertTrue(
Utils
.filterFunction(
new Tuple2<>(Result.newInstance("yushkevich p"),
Orcid
.newInstance(
"paul", "yushkevich", "paul a yushkevich",
new ArrayList<>()))));
}
@Test
public void wrongOrcidFalse3() throws Exception {
Assertions
.assertTrue(
Utils
.filterFunction(
new Tuple2<>(Result.newInstance("ghosh ss"),
Orcid
.newInstance(
"satrajit", "ghosh",
"satra",
Arrays.asList("satra", "satrajit s ghosh")))));
}
@Test
public void wrongOrcidTrue() throws Exception {
Assertions
.assertFalse(
Utils
.filterFunction(
new Tuple2<>(Result.newInstance("kluft lukas"),
Orcid
.newInstance(
"satrajit", "ghosh",
"satra",
Arrays.asList("satra", "satrajit s ghosh")))));
}
@Test
public void wrongOrcidFalse4() throws Exception {
Assertions
.assertTrue(
Utils
.filterFunction(
new Tuple2<>(Result.newInstance("schulz s a"),
Orcid
.newInstance(
"sebastian", "schulz",
"sebastian a schulz",
new ArrayList<>()))));
}
@Test
public void wrongOrcidFalse5() throws Exception {
Assertions
.assertTrue(
Utils
.filterFunction(
new Tuple2<>(Result.newInstance("domingues af"),
Orcid
.newInstance(
"allysson", "domingues",
"allysson f domingues",
new ArrayList<>()))));
}
@Test
public void wrongOrcidFalseConservative() throws Exception {
Assertions
.assertTrue(
Utils
.conservativeFilterFunction(
new Tuple2<>(Result.newInstance("veigas pires cristina"),
Orcid
.newInstance(
"cristina", "veiga pires", "c veiga pires",
Arrays.asList("c c veiga pires")))));
}
@Test
public void wrongOrcidFalseConservative2() throws Exception {
Assertions
.assertTrue(
Utils
.conservativeFilterFunction(
new Tuple2<>(Result.newInstance("yushkevich p"),
Orcid
.newInstance(
"paul", "yushkevich", "paul a yushkevich",
new ArrayList<>()))));
}
@Test
public void wrongOrcidFalseConservative3() throws Exception {
Assertions
.assertTrue(
Utils
.conservativeFilterFunction(
new Tuple2<>(Result.newInstance("ghosh ss"),
Orcid
.newInstance(
"satrajit", "ghosh",
"satra",
Arrays.asList("satra", "satrajit s ghosh")))));
}
@Test
public void wrongOrcidTrueConservative() throws Exception {
Assertions
.assertFalse(
Utils
.conservativeFilterFunction(
new Tuple2<>(Result.newInstance("kluft lukas"),
Orcid
.newInstance(
"satrajit", "ghosh",
"satra",
Arrays.asList("satra", "satrajit s ghosh")))));
}
@Test
public void wrongOrcidFalseConservative4() throws Exception {
Assertions
.assertTrue(
Utils
.conservativeFilterFunction(
new Tuple2<>(Result.newInstance("schulz s a"),
Orcid
.newInstance(
"sebastian", "schulz",
"sebastian a schulz",
new ArrayList<>()))));
}
@Test
public void wrongOrcidFalseConservative5() throws Exception {
Assertions
.assertTrue(
Utils
.conservativeFilterFunction(
new Tuple2<>(Result.newInstance("domingues af"),
Orcid
.newInstance(
"allysson", "domingues",
"allysson f domingues",
new ArrayList<>()))));
}
@Test
public void wrongOrcidTrueConservative2() throws Exception {
Assertions
.assertFalse(
Utils
.conservativeFilterFunction(
new Tuple2<>(Result.newInstance("figueiredo pontes lorena lobo de"),
Orcid
.newInstance(
"moyses", "soares",
"moyses antonio porto soares",
new ArrayList<>()))));
}
@Test
public void wrongOrcidFalseConservative6() throws Exception {
Assertions
.assertTrue(
Utils
.conservativeFilterFunction(
new Tuple2<>(Result.newInstance("da luz geraldo eduardo"),
Orcid
.newInstance(
"geraldo", "luz jr",
"luz jr g e",
new ArrayList<>()))));
}
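// two authors of the same result have swapped ORCID attributions: both ORCID-bearing entries
// match another author of the result, so the printed count is expected to be 2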
@Test
public void testShuffle() throws Exception {
List<ShuffleInfo> shuffleInfoList = new ArrayList<>();
shuffleInfoList
.add(
ShuffleInfo
.newInstance(
"Miriam", "Baglioni", "Miriam Baglioni", "50|fake_1",
"Alessia", "Bardi", "", new ArrayList<String>(), "orcid_alessia"));
shuffleInfoList.add(ShuffleInfo.newInstance("Alessia", "Bardi", "Alessia Bardi", "50|fake_1"));
shuffleInfoList.add(ShuffleInfo.newInstance("Miriam", "Baglioni", "Miriam Baglioni", "50|fake_1"));
shuffleInfoList
.add(
ShuffleInfo
.newInstance(
"Alessia", "Bardi", "Alessia Bardi", "50|fake_1",
"Miriam", "Baglioni", "", new ArrayList<String>(), "orcid_miriam"));
shuffleInfoList.add(ShuffleInfo.newInstance("Claudio", "Atzori", "Claudio Atzori", "50|fake_1"));
List<ShuffleInfo> tmp = shuffleInfoList
.stream()
.filter(e -> Optional.ofNullable(e.getOrcid()).isPresent())
.collect(Collectors.toList());
int count = 0;
for (ShuffleInfo e : tmp) {
if (verifyShuffle(e, shuffleInfoList))
count++;
}
System.out.println(count);
}
private boolean verifyShuffle(ShuffleInfo e, List<ShuffleInfo> shuffleInfoList) {
return shuffleInfoList.stream().anyMatch(si -> {
try {
final Orcid orcid = Orcid
.newInstance(e.getOname(), e.getOsurname(), e.getOcreditName(), e.getoOtherNames());
return Utils
.filterFunction(
new Tuple2<>(Result.newInstance(si.getAfullname()), orcid));
} catch (JsonProcessingException ex) {
ex.printStackTrace();
}
return false;
});
}
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,29 @@
{"otherNames": [], "inception": "2017-05-22T16:38:30.236Z", "surname": "hyy37", "mode": "Direct", "creditname": "void", "orcid": "0000-0002-8748-6992", "works": false, "name": "1380"}
{"otherNames": [], "inception": "2017-05-25T12:50:48.761Z", "surname": "hyy75", "mode": "Direct", "creditname": "void", "orcid": "0000-0001-7773-1109", "works": false, "name": "2775"}
{"otherNames": [], "inception": "2017-05-28T12:07:09.154Z", "surname": "hyy13", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-4728-6379", "works": false, "name": "434323"}
{"otherNames": [], "inception": "2017-08-10T07:07:23.818Z", "surname": "hyy44", "mode": "Direct", "creditname": "void", "orcid": "0000-0001-9502-3093", "works": false, "name": "58"}
{"otherNames": [], "inception": "2017-08-10T07:08:48.179Z", "surname": "hyy46", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-2933-0057", "works": false, "name": "60"}
{"otherNames": ["pang x y", "pang xueyong"], "inception": "2014-10-13T03:26:21.741Z", "surname": "?", "mode": "API", "creditname": "void", "orcid": "0000-0002-7397-5824", "works": true, "name": "??"}
{"otherNames": [], "inception": "2019-08-27T07:55:06.340Z", "surname": "therasa alphonsa", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0001-7205-6036", "works": false, "name": "a"}
{"otherNames": ["minto"], "inception": "2020-08-02T06:33:18.620Z", "surname": "karim", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0001-6111-6742", "works": false, "name": "a k mohammad fazlul"}
{"otherNames": [], "inception": "2014-05-01T09:13:11.783Z", "surname": "al-sammak", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0001-6646-4295", "works": false, "name": "a-imam"}
{"otherNames": [], "inception": "2019-12-06T12:53:04.045Z", "surname": "hassan", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-2957-4641", "works": false, "name": "a-s.u."}
{"otherNames": [], "inception": "2020-07-28T12:29:26.453Z", "surname": "ajakh", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0002-1081-8426", "works": false, "name": "a."}
{"otherNames": [], "inception": "2017-01-10T12:35:05.016Z", "surname": "antol\u00ednez", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0002-5451-3421", "works": false, "name": "a. (ana)"}
{"otherNames": [], "inception": "2018-08-20T05:00:15.964Z", "surname": "mahmudi", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-3187-941X", "works": false, "name": "a. aviv"}
{"otherNames": [], "inception": "2017-05-13T01:03:58.949Z", "surname": "akanmu", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0001-6223-5428", "works": false, "name": "a. c."}
{"otherNames": [], "inception": "2018-01-20T02:58:05.199Z", "surname": "inci", "mode": "Direct", "creditname": "void", "orcid": "0000-0002-0427-9745", "works": true, "name": "a. can"}
{"otherNames": ["a. kim ryan"], "inception": "2014-10-24T23:06:43.544Z", "surname": "hayes", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0002-2055-8269", "works": true, "name": "a. kim"}
{"otherNames": [], "inception": "2017-08-10T13:38:29.172Z", "surname": "bahadir", "mode": "Direct", "creditname": "void", "orcid": "0000-0002-4045-0001", "works": false, "name": "a. tugba"}
{"otherNames": [], "inception": "2018-08-29T07:49:31.093Z", "surname": "rayna", "mode": "Direct", "creditname": "void", "orcid": "0000-0002-7916-2031", "works": false, "name": "a.brite"}
{"otherNames": [], "inception": "2014-07-12T08:02:39.568Z", "surname": "kalyani", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2649-7126", "works": false, "name": "a.grace"}
{"otherNames": [], "inception": "2018-07-21T12:00:22.042Z", "surname": "ahmed", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-0777-5848", "works": false, "name": "a.i. mahbub uddin"}
{"otherNames": [], "inception": "2018-04-11T13:58:53.355Z", "surname": "a.kathirvel murugan", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2298-6301", "works": false, "name": "a.kathirvel murugan"}
{"otherNames": [], "inception": "2017-08-31T11:35:48.559Z", "surname": "dar", "mode": "Direct", "creditname": "void", "orcid": "0000-0001-8781-6309", "works": false, "name": "a.rashid"}
{"otherNames": [], "inception": "2014-08-26T00:25:30.968Z", "surname": "sayem", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2461-4667", "works": false, "name": "a.s.m."}
{"otherNames": [], "inception": "2019-10-03T01:27:08.212Z", "surname": "conte", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2862-6139", "works": false, "name": "aaron"}
{"otherNames": [], "inception": "2020-03-16T09:37:10.610Z", "surname": "rashmi", "mode": "Direct", "creditname": "void", "orcid": "0000-0003-4754-5465", "works": false, "name": "aarthi rashmi b"}
{"otherNames": [], "inception": "2017-02-28T19:01:59.146Z", "surname": "bhaskar", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0002-5794-1165", "works": false, "name": "aastha"}
{"otherNames": [], "inception": "2020-04-07T18:10:50.922Z", "surname": "belhabib", "mode": "Direct", "creditname": "void", "orcid": "0000-0001-6086-0588", "works": false, "name": "abdelfettah"}
{"otherNames": [], "inception": "2019-01-13T21:50:51.923Z", "surname": "laamani", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2055-2593", "works": false, "name": "abdellatif"}
{"otherNames": ["fákē", "miñhō"], "inception": "2019-01-13T21:50:51.923Z", "surname": "laamani", "mode": "Member-referred", "creditname": "void", "orcid": "0000-0003-2055-2593", "works": false, "name": "abdellatif"}

View File

@ -0,0 +1,4 @@
{"aname":"Miriam", "asurname":"Baglioni", "afullname":"Miriam Baglioni","oname": "Alessia","osurname": "Bardi","ocreditName": "", "oOtherNames": [],"orcid": "orcid_alessia","id": "50|fake1"}
{"aname":"Alessia", "asurname":"Bardi", "afullname":"Alessia Bardi","oname": null,"osurname": null,"ocreditName": null, "oOtherNames": null,"orcid": null,"id": "50|fake1"}
{"aname":"Claudio", "asurname":"Atzori", "afullname":"Claudio Atzori","oname": null,"osurname": null,"ocreditName": null, "oOtherNames": null,"orcid": null,"id": "50|fake1"}
{"aname":"Alessia", "asurname":"Bardi", "afullname":"Alessia Bardi","oname": "Miriam","osurname": "Baglioni","ocreditName": "", "oOtherNames": [],"orcid": "orcid_miriam","id": "50|fake1"}

View File

@ -26,6 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
@ -144,7 +145,7 @@ public class ConversionUtils {
.filter(pid -> pid != null)
.filter(pid -> pid.getQualifier() != null)
.filter(pid -> pid.getQualifier().getClassid() != null)
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID))
.map(pid -> pid.getValue())
.map(pid -> cleanOrcid(pid))
.filter(StringUtils::isNotBlank)

View File

@ -0,0 +1,91 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-contextpropagation</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-provision-scholexplorer</artifactId>
<version>1.2.4-SNAPSHOT</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,77 @@
package eu.dnetlib.dhp.contextpropagation;
import com.google.common.collect.Maps;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Map;
public class Costants implements Serializable {
private static Map<String, PropagationUse> publicationDatasetSemantics = Maps.newHashMap();
static {
publicationDatasetSemantics.put("issupplementedby", PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
publicationDatasetSemantics.put("cites", PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
publicationDatasetSemantics.put("describes", PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
publicationDatasetSemantics.put("references", PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
publicationDatasetSemantics.put("documents", PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
}
private static Map<String, PropagationUse> datasetDatasetSemantics = Maps.newHashMap();
static{
datasetDatasetSemantics.put("isdescribedby",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("iscitedby",PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("cites",PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("issupplementedby",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("issupplementto",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("iscontinuedby",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("continues",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("hasversion",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("isversionof",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("isnewversionof",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("ispreviousversionof",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("ispartof",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("haspart",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("references",PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("isreferencedby",PropagationUse.newInstance("reuse", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("documents",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("isdocumentedby",PropagationUse.newInstance("latent", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("isvariantformof",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("isoriginalformof",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("isidenticalto",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("obsoletes",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
datasetDatasetSemantics.put("isobsoletedby",PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
}
public static Map<String, PropagationUse> getPublicationDatasetSemantics() {
return publicationDatasetSemantics;
}
public static Map<String, PropagationUse> getDatasetDatasetSemantics() {
return datasetDatasetSemantics;
}
public static boolean containedInPubSem(String sem){
return publicationDatasetSemantics.containsKey(sem);
}
public static boolean containedInDatsSem(String sem){
return datasetDatasetSemantics.containsKey(sem);
}
public static PropagationUse getPublicationValue(String sem){
return publicationDatasetSemantics.get(sem);
}
public static PropagationUse getDatasetValue(String sem){
return datasetDatasetSemantics.get(sem);
}
}
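
A minimal usage sketch, not part of the changeset (the class name CostantsUsageSketch is made up), showing how the semantics tables above are meant to be queried before a relation is propagated:

import eu.dnetlib.dhp.contextpropagation.Costants;
import eu.dnetlib.dhp.contextpropagation.PropagationUse;

public class CostantsUsageSketch {
	public static void main(String[] args) {
		// "cites" is a publication->dataset semantics mapped to a "reuse" propagation with weight 1.0
		if (Costants.containedInPubSem("cites")) {
			PropagationUse use = Costants.getPublicationValue("cites");
			System.out.println(use.getUse() + " / " + use.getWeight()); // reuse / 1.0
		}
		// semantics that are not listed are simply not propagated
		System.out.println(Costants.containedInDatsSem("compiles")); // false
	}
}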

View File

@ -0,0 +1,19 @@
package eu.dnetlib.dhp.contextpropagation;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
public class DatasetPropagationStructure implements Serializable {
// initialised eagerly so that add() can be invoked on a freshly created instance
private Map<String, PropagationUse> propagation = new HashMap<>();
public Map<String, PropagationUse> getPropagation() {
return propagation;
}
public void add(String key, PropagationUse value){
propagation.put(key, value);
}
}
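
A small sketch, again outside the changeset, of what a DatasetPropagationStructure holds: for one dataset, the related dataset through which context can flow ("60|fake_dataset_id" is a made-up identifier):

import java.util.HashSet;
import eu.dnetlib.dhp.contextpropagation.DatasetPropagationStructure;
import eu.dnetlib.dhp.contextpropagation.PropagationUse;

public class DatasetPropagationSketch {
	public static void main(String[] args) {
		DatasetPropagationStructure dps = new DatasetPropagationStructure();
		// key: the related dataset id; value: how (and with which weight) context flows through it
		dps.add("60|fake_dataset_id", PropagationUse.newInstance("proxy", "1.0", new HashSet<>()));
		System.out.println(dps.getPropagation().keySet()); // [60|fake_dataset_id]
	}
}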

View File

@ -0,0 +1,32 @@
package eu.dnetlib.dhp.contextpropagation;
import java.io.Serializable;
import java.util.List;
public class Node implements Serializable {
private String id;
private List<String> publisher;
public List<String> getPublisher() {
return publisher;
}
public void setPublisher(List<String> publisher) {
this.publisher = publisher;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public static Node newInstance(String id, List<String> publisher){
Node n = new Node();
n.id = id;
n.publisher = publisher;
return n;
}
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.dhp.contextpropagation
import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator
object PropagationAggregator {
def getDatasetAggregator(): Aggregator[(String, PropagationStructure), PropagationStructure, PropagationStructure] = new Aggregator[(String, PropagationStructure), PropagationStructure, PropagationStructure]{
override def zero: PropagationStructure = new PropagationStructure()
override def reduce(b: PropagationStructure, a: (String, PropagationStructure)): PropagationStructure = {
b.mergeFrom(a._2)
}
override def merge(wx: PropagationStructure, wy: PropagationStructure): PropagationStructure = {
wx.mergeFrom(wy)
}
override def finish(reduction: PropagationStructure): PropagationStructure = reduction
override def bufferEncoder: Encoder[PropagationStructure] =
Encoders.kryo(classOf[PropagationStructure])
override def outputEncoder: Encoder[PropagationStructure] =
Encoders.kryo(classOf[PropagationStructure])
}
}

View File

@ -0,0 +1,48 @@
package eu.dnetlib.dhp.contextpropagation;
import java.io.Serializable;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;
public class PropagationStructure implements Serializable {
// initialised eagerly so that add() can be invoked on a freshly created instance
private Map<String, List<PropagationUse>> propagation = new HashMap<>();
public Map<String, List<PropagationUse>> getPropagation() {
return propagation;
}
public void add(String key, List<PropagationUse> value){
propagation.put(key, value);
}
public void setPropagation(Map<String, List<PropagationUse>> propagation) {
this.propagation = propagation;
}
private void mergeList(PropagationUse use, List<PropagationUse> acc){
for(PropagationUse pu: acc){
if (use.getUse().equals(pu.getUse())){
// same use: union the paths and keep the larger weight
// (weights are decimal strings such as "1.0", so they are compared as doubles)
pu.getPath().addAll(use.getPath());
if (Double.parseDouble(pu.getWeight()) < Double.parseDouble(use.getWeight())){
pu.setWeight(use.getWeight());
}
return;
}
}
// no entry with this use yet: keep it as a new one
acc.add(use);
}
public PropagationStructure mergeFrom(PropagationStructure ps){
for(String key : ps.propagation.keySet()){
if (propagation.containsKey(key)){
ps.propagation.get(key).forEach( use -> mergeList(use, propagation.get(key)));
}else{
propagation.put(key, ps.propagation.get(key).stream().map(pu -> PropagationUse.copyInstance(pu)).collect(Collectors.toList()));
}
}
return this;
}
}
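
A hypothetical check of the merge semantics above (identifiers such as "50|pub_1" are invented): merging two structures that carry the same publication with the same use unions the paths and keeps the larger weight:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import eu.dnetlib.dhp.contextpropagation.PropagationStructure;
import eu.dnetlib.dhp.contextpropagation.PropagationUse;

public class PropagationMergeSketch {
	public static void main(String[] args) {
		PropagationStructure a = new PropagationStructure();
		a.add("50|pub_1", new ArrayList<>(Arrays.asList(
			PropagationUse.newInstance("reuse", "0.5", new HashSet<>(Arrays.asList("60|ds_1"))))));

		PropagationStructure b = new PropagationStructure();
		b.add("50|pub_1", new ArrayList<>(Arrays.asList(
			PropagationUse.newInstance("reuse", "1.0", new HashSet<>(Arrays.asList("60|ds_2"))))));

		a.mergeFrom(b);
		PropagationUse merged = a.getPropagation().get("50|pub_1").get(0);
		System.out.println(merged.getWeight()); // 1.0
		System.out.println(merged.getPath());   // contains both 60|ds_1 and 60|ds_2
	}
}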

View File

@ -0,0 +1,54 @@
package eu.dnetlib.dhp.contextpropagation;
import java.io.Serializable;
import java.util.List;
import java.util.Set;
public class PropagationUse implements Serializable {
private String use;
private String weight;
private Set<String> path;
public String getUse() {
return use;
}
public void setUse(String use) {
this.use = use;
}
public String getWeight() {
return weight;
}
public void setWeight(String weight) {
this.weight = weight;
}
public Set<String> getPath() {
return path;
}
public void setPath(Set<String> path) {
this.path = path;
}
public static PropagationUse newInstance(String use, String weight, Set<String> path){
PropagationUse pu = new PropagationUse();
pu.use = use;
pu.weight = weight;
pu.path = path;
return pu;
}
public static PropagationUse copyInstance(PropagationUse use){
PropagationUse pu = new PropagationUse();
pu.path = use.path;
pu.weight = use.weight;
pu.use = use.use;
return pu;
}
}

View File

@ -0,0 +1,33 @@
package eu.dnetlib.dhp.contextpropagation;
import java.io.Serializable;
public class RelationPropagation implements Serializable {
private Node source;
private Node target;
private String semantics;
public Node getSource() {
return source;
}
public void setSource(Node source) {
this.source = source;
}
public Node getTarget() {
return target;
}
public void setTarget(Node target) {
this.target = target;
}
public String getSemantics() {
return semantics;
}
public void setSemantics(String semantics) {
this.semantics = semantics;
}
}

View File

@ -0,0 +1,127 @@
package eu.dnetlib.dhp.contextpropagation
import java.util
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixEntityId}
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import scala.collection.JavaConverters._
object SparkContextPropagation {
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertDatasetToJson.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/dataset2Json.json")))
parser.parseArgument(args)
val conf = new SparkConf
val spark = SparkSession.builder.config(conf).appName(SparkConvertDatasetToJson.getClass.getSimpleName).master(parser.get("master")).getOrCreate
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
implicit val mapEncoderPub: Encoder[PropagationStructure] = Encoders.kryo[PropagationStructure]
implicit val mapEncoderDats: Encoder[DatasetPropagationStructure] = Encoders.kryo[DatasetPropagationStructure]
implicit val tupleForPropagation: Encoder[(String, PropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
implicit val tupleForPropagationDars: Encoder[(String, DatasetPropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderDats)
implicit val stringEncoder: Encoder[String] = Encoders.STRING
val workingPath = parser.get("workingPath")
def getPublisherList(item: List[ScholixEntityId]) : util.List[String] =
{
item.map(entry => entry.getName).asJava
}
def propagateDataset (item: ((String, PropagationStructure), (String, DatasetPropagationStructure))) : (String, PropagationStructure) = {
val propagation = item._1._2.getPropagation.asScala
val dsprob : DatasetPropagationStructure= item._2._2
val source = dsprob.getPropagation.keySet().iterator().next()
val dic = new scala.collection.mutable.HashMap[String, util.List[PropagationUse]]
propagation.keysIterator.foreach(key => {
val entries = propagation.get(key).get.asScala
entries.foreach(entry => {
if((entry.getUse == dsprob.getPropagation.get(source).getUse || dsprob.getPropagation.get(source).getUse == "proxy")
&& !entry.getPath.contains(source)) {
// weights are decimal strings such as "1.0", so they must be multiplied as doubles
val new_p = entry.getWeight.toDouble * dsprob.getPropagation.get(source).getWeight.toDouble
if (new_p > 0.3){
val newentry : PropagationUse = PropagationUse.copyInstance(entry)
newentry.setWeight(String.valueOf(new_p))
newentry.getPath.add(source)
// create the bucket for this key on first use, then append to it
if (!dic.keySet.contains(key))
dic.put(key, new util.ArrayList[PropagationUse]())
dic(key).add(newentry)
}
}
})
})
var ps: PropagationStructure = new PropagationStructure
ps.setPropagation(dic.asJava)
(source, ps)
}
spark.read.load(s"$workingPath/summary").as[ScholixSummary]
.map(s => new ObjectMapper().writeValueAsString(s))(Encoders.STRING)
.rdd.repartition(500).saveAsTextFile(s"$workingPath/summary_json", classOf[GzipCodec])
val allowedRelations : Dataset[RelationPropagation] = spark.read.load(s"$workingPath/scholix").as[Scholix]
.filter(s => !s.getSource().getDnetIdentifier().substring(0,2).equals("70") )
.filter(s => !s.getTarget().getDnetIdentifier().substring(0,2).equals("70"))
.map(s => {
val rp = new RelationPropagation
rp.setSource(Node.newInstance(s.getSource.getDnetIdentifier, getPublisherList(s.getSource.getPublisher.asScala.toList)))
rp.setTarget(Node.newInstance(s.getTarget.getDnetIdentifier, getPublisherList(s.getTarget.getPublisher.asScala.toList)))
rp.setSemantics(s.getRelationship.getName)
rp
})
val pubs_rel : Dataset[RelationPropagation] = allowedRelations
.filter(r => r.getSource.getId.substring(0,2) == "50"
&& r.getTarget.getId.substring(0,2) == "60"
&& Costants.containedInPubSem(r.getSemantics))
val dats_rel : Dataset[RelationPropagation] = allowedRelations
.filter(r => r.getSource.getId.substring(0,2) == "60"
&& r.getTarget.getId.substring(0,2) == "60"
&& Costants.containedInDatsSem(r.getSemantics)
&& r.getSource.getId != r.getTarget.getId)
val publication_dataset : Dataset[(String, PropagationStructure)] = pubs_rel.map(r => {
val ps = new PropagationStructure
val pv : List[PropagationUse] = List(PropagationUse.copyInstance(Costants.getPublicationValue(r.getSemantics)))
ps.add(r.getSource.getId, pv.asJava)
(r.getTarget.getId, ps)
})
val dataset_dataset : Dataset[(String, DatasetPropagationStructure)] = dats_rel.map(r => {
val ps = new DatasetPropagationStructure
ps.add(r.getTarget.getId, PropagationUse.copyInstance(Costants.getDatasetValue(r.getSemantics)))
(r.getSource.getId, ps)
})
val pl1 : Dataset[(String, PropagationStructure)] = publication_dataset.groupByKey(_._1)
.agg(PropagationAggregator.getDatasetAggregator().toColumn)
val pl2_step1 : Dataset [(String, PropagationStructure)] = pl1.joinWith(dataset_dataset, pl1("_1").equalTo(dataset_dataset("_1")))
.map(propagateDataset)
val pl2 : Dataset [(String, PropagationStructure)] = pl1.union(pl2_step1).groupByKey(_._1)
.agg(PropagationAggregator.getDatasetAggregator().toColumn)
}
}
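
To make the pruning rule above concrete, a purely arithmetic sketch with hypothetical weights (the tables in Costants currently use "1.0" everywhere, so with the shipped values nothing is actually pruned): the propagated weight is the product of the weight already reached and the weight of the dataset-to-dataset relation, and the entry is kept only while that product stays above 0.3 and the dataset being reached is not already on the path:

public class WeightPropagationSketch {
	public static void main(String[] args) {
		double threshold = 0.3;
		double firstHop = 0.5 * 1.0;        // hypothetical publication->dataset weight times relation weight
		double secondHop = firstHop * 0.5;  // one more hop through another hypothetical 0.5 relation
		System.out.println(firstHop > threshold);  // true  -> propagated
		System.out.println(secondHop > threshold); // false -> pruned
	}
}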

View File

@ -18,7 +18,7 @@ import eu.dnetlib.dhp.schema.oaf.Field;
public class DatePicker {
private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
private static final String DATE_PATTERN = "^(\\d{4})-(\\d{2})-(\\d{2})";
private static final String DATE_DEFAULT_SUFFIX = "01-01";
private static final int YEAR_LB = 1300;
private static final int YEAR_UB = Year.now().getValue() + 5;
@ -28,6 +28,7 @@ public class DatePicker {
final Map<String, Integer> frequencies = dateofacceptance
.parallelStream()
.filter(StringUtils::isNotBlank)
.map(d -> substringBefore(d, "T"))
.collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum));
if (frequencies.isEmpty()) {
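
The added map step normalises ISO date-times to plain yyyy-MM-dd values before the frequency count; a quick illustrative check of org.apache.commons.lang3.StringUtils.substringBefore (the class name below is made up), which returns the input unchanged when no "T" is present:

import static org.apache.commons.lang3.StringUtils.substringBefore;

public class SubstringBeforeSketch {
	public static void main(String[] args) {
		System.out.println(substringBefore("2016-06-16T12:00:00Z", "T")); // 2016-06-16
		System.out.println(substringBefore("2016-06-16", "T"));           // 2016-06-16
	}
}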

View File

@ -0,0 +1,44 @@
package eu.dnetlib.dhp.oa.dedup;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Collection;
import org.junit.jupiter.api.Test;
import com.clearspring.analytics.util.Lists;
public class DatePickerTest {
Collection<String> dates = Lists.newArrayList();
@Test
public void testPickISO() {
dates.add("2016-01-01T12:00:00Z");
dates.add("2016-06-16T12:00:00Z");
dates.add("2020-01-01T12:00:00Z");
dates.add("2020-10-01T12:00:00Z");
assertEquals("2020-10-01", DatePicker.pick(dates).getValue());
}
@Test
public void testPickSimple() {
dates.add("2016-01-01");
dates.add("2016-06-16");
dates.add("2020-01-01");
dates.add("2020-10-01");
assertEquals("2020-10-01", DatePicker.pick(dates).getValue());
}
@Test
public void testPickFrequent() {
dates.add("2016-02-01");
dates.add("2016-02-01");
dates.add("2016-02-01");
dates.add("2020-10-01");
assertEquals("2016-02-01", DatePicker.pick(dates).getValue());
}
}

View File

@ -5,6 +5,7 @@ import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue,
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.ModelConstants
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
@ -28,7 +29,6 @@ object DoiBoostMappingUtil {
//STATIC STRING
val MAG = "microsoft"
val MAG_NAME = "Microsoft Academic Graph"
val ORCID = "orcid"
val ORCID_PENDING = "orcid_pending"
val CROSSREF = "Crossref"
val UNPAYWALL = "UnpayWall"
@ -37,8 +37,6 @@ object DoiBoostMappingUtil {
val doiBoostNSPREFIX = "doiboost____"
val OPENAIRE_PREFIX = "openaire____"
val SEPARATOR = "::"
val DNET_LANGUAGES = "dnet:languages"
val PID_TYPES = "dnet:pid_types"
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
@ -326,8 +324,8 @@ object DoiBoostMappingUtil {
def createORIDCollectedFrom(): KeyValue = {
val cf = new KeyValue
cf.setValue(ORCID)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ORCID.toLowerCase))
cf.setValue(ModelConstants.ORCID_DS)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ModelConstants.ORCID))
cf
}
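
A sketch of how the collectedfrom key above is composed, assuming DHPUtils.md5 is a lower-case hex MD5 of its input (OPENAIRE_PREFIX and SEPARATOR are "openaire____" and "::" as defined earlier in this object):

import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

public class OrcidDatasourceKeySketch {
	public static void main(String[] args) throws Exception {
		MessageDigest md = MessageDigest.getInstance("MD5");
		byte[] digest = md.digest("orcid".getBytes(StandardCharsets.UTF_8));
		String md5 = String.format("%032x", new BigInteger(1, digest));
		// expected to reproduce the ORCID datasource identifier used as collectedfrom key
		System.out.println("10|openaire____::" + md5);
	}
}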

View File

@ -87,7 +87,7 @@ case object Crossref2Oaf {
//MAPPING Crossref DOI into PID
val doi: String = (json \ "DOI").extract[String]
result.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
result.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
//MAPPING Crossref DOI into OriginalId
//and Other Original Identifier of dataset like clinical-trial-number

View File

@ -0,0 +1,65 @@
package eu.dnetlib.doiboost.crossref;
import java.io.BufferedOutputStream;
import java.net.URI;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
 * Reads the Crossref dump (a tar.gz archive on HDFS) and writes each archive entry
 * back to HDFS as an individual gzip-compressed file.
 */
public class ExtractCrossrefRecords {
public static void main(String[] args) throws Exception {
String hdfsServerUri;
String workingPath;
String crossrefFileNameTarGz;
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
ExtractCrossrefRecords.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/crossref_dump_reader.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
workingPath = parser.get("workingPath");
crossrefFileNameTarGz = parser.get("crossrefFileNameTarGz");
Path hdfsreadpath = new Path(hdfsServerUri.concat(workingPath).concat(crossrefFileNameTarGz));
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
FileSystem fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
FSDataInputStream crossrefFileStream = fs.open(hdfsreadpath);
try (TarArchiveInputStream tais = new TarArchiveInputStream(
new GzipCompressorInputStream(crossrefFileStream))) {
TarArchiveEntry entry = null;
while ((entry = tais.getNextTarEntry()) != null) {
if (entry.isDirectory()) {
} else {
try (
FSDataOutputStream out = fs
.create(new Path(workingPath.concat("filess/").concat(entry.getName()).concat(".gz")));
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
IOUtils.copy(tais, gzipOs);
}
}
}
}
Log.info("Crossref dump reading completed");
}
}
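
An illustrative invocation, with a placeholder HDFS URI (the workingPath and file name below match the Oozie workflow further down); on a real cluster the class is launched by the "ReadCrossRefDump" java action rather than by hand:

public class ExtractCrossrefRecordsSketch {
	public static void main(String[] args) throws Exception {
		eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords.main(new String[] {
			"--hdfsServerUri", "hdfs://nameservice1",  // placeholder, the workflow passes ${nameNode}
			"--workingPath", "/data/doiboost/crossref/",
			"--crossrefFileNameTarGz", "crossref.tar.gz"
		});
	}
}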

View File

@ -33,9 +33,9 @@ object SparkMapDumpIntoOAF {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderRelatons: Encoder[Relation] = Encoders.kryo[Relation]
implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset]
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
val targetPath = parser.get("targetPath")
import spark.implicits._
spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
.flatMap(k => Crossref2Oaf.convert(k.json))

View File

@ -188,7 +188,7 @@ case object ConversionUtil {
val authors = inputParams._2
val pub = new Publication
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
//Set identifier as 50|doiboost____::md5(DOI)
@ -247,7 +247,7 @@ case object ConversionUtil {
val description = inputParams._2
val pub = new Publication
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
//Set identifier as 50 | doiboost____::md5(DOI)

View File

@ -30,7 +30,6 @@ public class PublicationToOaf implements Serializable {
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
public static final String ORCID = "ORCID";
public final static String orcidPREFIX = "orcid_______";
public static final String OPENAIRE_PREFIX = "openaire____";
public static final String SEPARATOR = "::";
@ -69,7 +68,9 @@ public class PublicationToOaf implements Serializable {
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
{
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
put(
ModelConstants.ORCID,
new Pair<>(ModelConstants.ORCID_DS, OPENAIRE_PREFIX + SEPARATOR + ModelConstants.ORCID));
}
};
@ -102,8 +103,6 @@ public class PublicationToOaf implements Serializable {
}
}
public static final String PID_TYPES = "dnet:pid_types";
public Oaf generatePublicationActionsFromJson(final String json) {
try {
if (parsedPublications != null) {
@ -138,8 +137,8 @@ public class PublicationToOaf implements Serializable {
mapQualifier(
"sysimport:actionset:orcidworks-no-doi",
"sysimport:actionset:orcidworks-no-doi",
"dnet:provenanceActions",
"dnet:provenanceActions"));
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS));
publication.setDataInfo(dataInfo);
publication.setLastupdatetimestamp(new Date().getTime());
@ -159,7 +158,9 @@ public class PublicationToOaf implements Serializable {
publication
.getExternalReference()
.add(
convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
convertExtRef(
extId, classid, classname, ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES));
}
});
@ -505,24 +506,21 @@ public class PublicationToOaf implements Serializable {
private KeyValue createCollectedFrom() {
KeyValue cf = new KeyValue();
cf.setValue(ORCID);
cf.setValue(ModelConstants.ORCID_DS);
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
return cf;
}
private KeyValue createHostedBy() {
KeyValue hb = new KeyValue();
hb.setValue("Unknown Repository");
hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
return hb;
return ModelConstants.UNKNOWN_REPOSITORY;
}
private StructuredProperty mapAuthorId(String orcidId) {
final StructuredProperty sp = new StructuredProperty();
sp.setValue(orcidId);
final Qualifier q = new Qualifier();
q.setClassid(ORCID.toLowerCase());
q.setClassname(ORCID.toLowerCase());
q.setClassid(ModelConstants.ORCID);
q.setClassname(ModelConstants.ORCID_CLASSNAME);
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
q.setSchemename(ModelConstants.DNET_PID_TYPES);
sp.setQualifier(q);
@ -535,8 +533,8 @@ public class PublicationToOaf implements Serializable {
mapQualifier(
"sysimport:crosswalk:entityregistry",
"Harvested",
"dnet:provenanceActions",
"dnet:provenanceActions"));
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS));
sp.setDataInfo(dataInfo);
return sp;
}

View File

@ -1,5 +1,6 @@
package eu.dnetlib.doiboost.uw
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Instance, Publication}
import org.json4s
import org.json4s.DefaultFormats
@ -32,7 +33,7 @@ object UnpayWallToOAF {
val is_oa = (json\ "is_oa").extract[Boolean]
val oaLocation:OALocation = (json \ "best_oa_location").extractOrElse[OALocation](null)
pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
pub.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setId(generateIdentifier(pub, doi.toLowerCase))
pub.setCollectedfrom(List(createUnpayWallCollectedFrom()).asJava)

View File

@ -0,0 +1,7 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"crossrefFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
{"paramName":"issm", "paramLongName":"isSparkSessionManaged", "paramDescription": "the name of the activities orcid file", "paramRequired": false}
]

View File

@ -0,0 +1,68 @@
<workflow-app name="read Crossref dump from HDFS" xmlns="uri:oozie:workflow:0.5">
<parameters>
<!-- <property>-->
<!-- <name>workingPath</name>-->
<!-- <description>the working dir base path</description>-->
<!-- </property>-->
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="ReadCrossRefDump"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ReadCrossRefDump">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords</main-class>
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
<arg>--workingPath</arg><arg>/data/doiboost/crossref/</arg>
<arg>--crossrefFileNameTarGz</arg><arg>crossref.tar.gz</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="SparkReadCrossRefDump">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>SparkReadCrossRefDump</name>
<class>eu.dnetlib.doiboost.crossref.ExtractCrossrefRecords</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=20
--executor-memory=6G
--driver-memory=7G
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
</spark-opts>
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
<arg>--workingPath</arg><arg>/data/doiboost/crossref/</arg>
<arg>--crossrefFileNameTarGz</arg><arg>crossref.tar.gz</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,372 +0,0 @@
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorIntersectionMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<!-- Itersection Parameters -->
<property>
<name>workingPath</name>
<description>the working Path</description>
</property>
<property>
<name>hostedByMapPath</name>
<description>the hostedByMap Path</description>
</property>
<property>
<name>outputPath</name>
<description>the Path of the sequence file action set</description>
</property>
<!-- Crossref Parameters -->
<property>
<name>inputPathCrossref</name>
<description>the Crossref input path</description>
</property>
<property>
<name>crossrefTimestamp</name>
<description>Timestamp for the Crossref incremental Harvesting</description>
</property>
<property>
<name>esServer</name>
<description>elasticsearch server url for the Crossref Harvesting</description>
</property>
<property>
<name>esIndex</name>
<description>elasticsearch index name for the Crossref Harvesting</description>
</property>
<!-- MAG Parameters -->
<property>
<name>MAGDumpPath</name>
<description>the MAG dump working path</description>
</property>
<property>
<name>inputPathMAG</name>
<description>the MAG working path</description>
</property>
<!-- UnpayWall Parameters -->
<property>
<name>inputPathUnpayWall</name>
<description>the UnpayWall working path</description>
</property>
<!-- ORCID Parameters -->
<property>
<name>inputPathOrcid</name>
<description>the ORCID input path</description>
</property>
<property>
<name>workingPathOrcid</name>
<description>the ORCID working path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="ConvertCrossrefToOAF">${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'}</case>
<case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
<case to="ProcessMAG">${wf:conf('resumeFrom') eq 'PreprocessMag'}</case>
<case to="ProcessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
<case to="ProcessORCID">${wf:conf('resumeFrom') eq 'PreprocessORCID'}</case>
<case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
<case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
<default to="ImportCrossRef"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ImportCrossRef">
<java>
<main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
<arg>--targetPath</arg><arg>${inputPathCrossref}/index_update</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--esServer</arg><arg>${esServer}</arg>
<arg>--esIndex</arg><arg>${esIndex}</arg>
<arg>--timestamp</arg><arg>${crossrefTimestamp}</arg>
</java>
<ok to="GenerateCrossrefDataset"/>
<error to="Kill"/>
</action>
<!-- CROSSREF SECTION -->
<action name="GenerateCrossrefDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenerateCrossrefDataset</name>
<class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--workingPath</arg><arg>${inputPathCrossref}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="RenameDataset"/>
<error to="Kill"/>
</action>
<action name="RenameDataset">
<fs>
<delete path="${inputPathCrossref}/crossref_ds"/>
<move source="${inputPathCrossref}/crossref_ds_updated"
target="${inputPathCrossref}/crossref_ds"/>
</fs>
<ok to="ResetMagWorkingPath"/>
<error to="Kill"/>
</action>
<!-- MAG SECTION -->
<action name="ResetMagWorkingPath">
<fs>
<delete path="${inputPathMAG}/dataset"/>
<delete path="${inputPathMAG}/process"/>
</fs>
<ok to="ConvertMagToDataset"/>
<error to="Kill"/>
</action>
<action name="ConvertMagToDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${MAGDumpPath}</arg>
<arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ConvertCrossrefToOAF"/>
<error to="Kill"/>
</action>
<action name="ConvertCrossrefToOAF">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ConvertCrossrefToOAF</name>
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ProcessMAG"/>
<error to="Kill"/>
</action>
<action name="ProcessMAG">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to OAF Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkProcessMAG</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorIntersectionMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg>
<arg>--workingPath</arg><arg>${inputPathMAG}/process</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ProcessUW"/>
<error to="Kill"/>
</action>
<!-- UnpayWall SECTION -->
<action name="ProcessUW">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert UnpayWall to Dataset</name>
<class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathUnpayWall}/uw_extracted</arg>
<arg>--targetPath</arg><arg>${workingPath}/uwPublication</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ProcessORCID"/>
<error to="Kill"/>
</action>
<!-- ORCID SECTION -->
<action name="ProcessORCID">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert ORCID to Dataset</name>
<class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
<arg>--workingPath</arg><arg>${workingPathOrcid}</arg>
<arg>--targetPath</arg><arg>${workingPath}/orcidPublication</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="CreateDOIBoost"/>
<error to="Kill"/>
</action>
<!-- INTERSECTION SECTION-->
<action name="CreateDOIBoost">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create DOIBoost Infospace</name>
<class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorIntersectionMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
<arg>--affiliationPath</arg><arg>${inputPathMAG}/dataset/Affiliations</arg>
<arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/dataset/PaperAuthorAffiliations</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="GenerateActionSet"/>
<error to="Kill"/>
</action>
<action name="GenerateActionSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Generate DOIBoost ActionSet</name>
<class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>
<arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
<arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
<arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
<arg>--dbOrganizationPath</arg><arg>${workingPath}/doiBoostOrganization</arg>
<arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
<arg>--sFilePath</arg><arg>${outputPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

File diff suppressed because one or more lines are too long

View File

@ -117,6 +117,12 @@
<artifactId>json4s-jackson_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-provision-scholexplorer</artifactId>
<version>1.2.4-SNAPSHOT</version>
</dependency>
</dependencies>

View File

@ -0,0 +1,78 @@
package eu.dnetlib.dhp.contextpropagation;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Map;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.contextpropagation.model.PropagationUse;
public class Costants implements Serializable {
private static Map<String, PropagationUse> publicationDatasetSemantics = Maps.newHashMap();
static {
publicationDatasetSemantics
.put("issupplementedby", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
publicationDatasetSemantics.put("cites", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
publicationDatasetSemantics.put("describes", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
publicationDatasetSemantics.put("references", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
publicationDatasetSemantics.put("documents", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
}
private static Map<String, PropagationUse> datasetDatasetSemantics = Maps.newHashMap();
static {
datasetDatasetSemantics.put("isdescribedby", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("iscitedby", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("cites", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("issupplementedby", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("issupplementto", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("iscontinuedby", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("continues", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("hasversion", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("isversionof", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("isnewversionof", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
datasetDatasetSemantics
.put("ispreviousversionof", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("ispartof", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("haspart", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("references", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("isreferencedby", PropagationUse.newInstance("reuse", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("documents", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("isdocumentedby", PropagationUse.newInstance("latent", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("isvariantformof", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("isoriginalformof", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("isidenticalto", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("obsoletes", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
datasetDatasetSemantics.put("isobsoletedby", PropagationUse.newInstance("proxy", "1.0", new ArrayList<>()));
}
public static Map<String, PropagationUse> getPublicationDatasetSemantics() {
return publicationDatasetSemantics;
}
public static Map<String, PropagationUse> getDatasetDatasetSemantics() {
return datasetDatasetSemantics;
}
public static boolean containedInPubSem(String sem) {
return publicationDatasetSemantics.containsKey(sem);
}
public static boolean containedInDatsSem(String sem) {
return datasetDatasetSemantics.containsKey(sem);
}
public static PropagationUse getPublicationValue(String sem) {
return publicationDatasetSemantics.get(sem);
}
public static PropagationUse getDatasetValue(String sem) {
return datasetDatasetSemantics.get(sem);
}
}

View File

@ -0,0 +1,60 @@
package eu.dnetlib.dhp.contextpropagation
import eu.dnetlib.dhp.contextpropagation.model.{EnrichedEntries, PropagationStructure}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}
object PropagationAggregator {
def getDatasetAggregator(): Aggregator[(String, PropagationStructure), PropagationStructure, PropagationStructure] = new Aggregator[(String, PropagationStructure), PropagationStructure, PropagationStructure]{
override def zero: PropagationStructure = new PropagationStructure()
override def reduce(b: PropagationStructure, a: (String, PropagationStructure)): PropagationStructure = {
b.mergeFrom(a._2)
}
override def merge(wx: PropagationStructure, wy: PropagationStructure): PropagationStructure = {
wx.mergeFrom(wy)
}
override def finish(reduction: PropagationStructure): PropagationStructure = reduction
override def bufferEncoder: Encoder[PropagationStructure] =
Encoders.kryo(classOf[PropagationStructure])
override def outputEncoder: Encoder[PropagationStructure] =
Encoders.kryo(classOf[PropagationStructure])
}
def mergeEnrichedEntries(): Aggregator[(String, EnrichedEntries), EnrichedEntries, EnrichedEntries] = new Aggregator[(String, EnrichedEntries), EnrichedEntries, EnrichedEntries]{
override def zero: EnrichedEntries = new EnrichedEntries()
override def reduce(b: EnrichedEntries, a: (String, EnrichedEntries)): EnrichedEntries = {
b.mergeWith(a._2)
}
override def merge(wx: EnrichedEntries, wy: EnrichedEntries): EnrichedEntries = {
wx.mergeWith(wy)
}
override def finish(reduction: EnrichedEntries): EnrichedEntries = reduction
override def bufferEncoder: Encoder[EnrichedEntries] =
Encoders.kryo(classOf[EnrichedEntries])
override def outputEncoder: Encoder[EnrichedEntries] =
Encoders.kryo(classOf[EnrichedEntries])
}
}

View File

@ -0,0 +1,250 @@
package eu.dnetlib.dhp.contextpropagation
import java.util
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import eu.dnetlib.dhp.contextpropagation.model.{DatasetPropagationStructure, EnrichedEntries, MapSxOA, Node, PropagationStructure, PropagationUse, RelationPropagation}
import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixEntityId}
import eu.dnetlib.dhp.provision.scholix.summary.{SchemeValue, ScholixSummary}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import scala.collection.mutable.ListBuffer
import scala.collection.JavaConverters._
object PropagationUtils {
implicit val enrichedEntitiesEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
def getSelectedNodes(path: String, spark:SparkSession): Dataset[String] ={
implicit val stringEncoder: Encoder[String] = Encoders.STRING
val allowedRelations = spark.read.load(path).as[RelationPropagation]
val pubs_rel : Dataset[RelationPropagation] = allowedRelations
.filter(r => r.getSource.getId.startsWith("50")
&& r.getTarget.getId.startsWith("60")
&& Costants.containedInPubSem(r.getSemantics.toLowerCase()))
val dats_rel : Dataset[RelationPropagation] = allowedRelations
.filter(r => r.getSource.getId.startsWith("60")
&& r.getTarget.getId.startsWith("60")
&& Costants.containedInDatsSem(r.getSemantics.toLowerCase())
&& r.getSource.getId != r.getTarget.getId)
pubs_rel.map(r => r.getSource.getId).union(pubs_rel.map(r => r.getTarget.getId))
.union(dats_rel.map(r => r.getSource.getId)).union(dats_rel.map(r => r.getTarget.getId)).distinct()
}
def getSubjectList(value: util.List[String], scheme: util.List[String]): util.List[SchemeValue] = {
var subjects = new ListBuffer[SchemeValue]()
var i = 0
for (elem <- value.asScala) {
val sv :SchemeValue = new SchemeValue()
sv.setScheme(scheme.get(i))
sv.setValue(elem)
subjects += sv
i += 1
}
subjects.toList.asJava
}
def propagateDataset (item: ((String, PropagationStructure), (String, DatasetPropagationStructure))) : List[(String, PropagationStructure)] = {
val lst = new ListBuffer[(String,PropagationStructure)]()
lst += item._1
if(item._2 != null){
val propagation = item._1._2.getPropagation.asScala
val dsprob: DatasetPropagationStructure = item._2._2
val source = dsprob.getPropagation.keySet().iterator().next()
val dic = new scala.collection.mutable.HashMap[String, util.List[PropagationUse]]
propagation.keysIterator.foreach(key => {
val entries = propagation.get(key).get.asScala
entries.foreach(entry => {
if ((entry.getUse == dsprob.getPropagation.get(source).getUse || dsprob.getPropagation.get(source).getUse == "proxy")
&& !entry.getPath.contains(source)) {
var new_p = entry.getWeight.toDouble * dsprob.getPropagation.get(source).getWeight.toDouble
if (new_p > 0.3) {
var newentry: PropagationUse = PropagationUse.copyInstance(entry)
newentry.setWeight(String.valueOf(new_p))
if(!newentry.getPath.contains(item._1._1))
newentry.getPath.add(item._1._1)
if (!dic.keySet.contains(key)) {
dic.put(key, new util.ArrayList[PropagationUse]())
}
dic(key).add(newentry)
}
}
})
})
var ps: PropagationStructure = new PropagationStructure
ps.setPropagation(dic.asJava)
lst += ((source, ps))
}
lst.toList
}
def enrichScholix(summary_path: String, spark: SparkSession): Dataset[EnrichedEntries] = {
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
spark.read.load(summary_path).as[ScholixSummary]
.map(ss => {
val ee: EnrichedEntries = new EnrichedEntries()
ee.setScholixId(ss.getId)
ee.setTitle(ss.getTitle)
if (ss.getDescription != null) {
ee.setDescription(List(ss.getDescription).asJava)
} else {
ee.setDescription(new util.ArrayList[String]())
}
if (ss.getSubject != null) {
ee.setSubject(ss.getSubject)
} else {
ee.setSubject(new util.ArrayList[SchemeValue]())
}
if (ee.getDescription.size() > 0 && ee.getSubject.size() > 0){
ee
}
else{
null
}
})
}
def mergeEnrichments(item : ((String, EnrichedEntries), (String, EnrichedEntries)) ): EnrichedEntries = {
// chain the branches with else so the early cases are actually returned
if (item._1 == null)
item._2._2
else if (item._2 == null)
item._1._2
else
item._2._2.mergeWith(item._1._2)
}
/*#reads the scholexplorer scholix dump. It filters out records with prefix 70 (unknown)
input = sc.textFile('/user/dnet.scholexplorer/scholix/provision/scholix_dump/scholix_json').map(json.loads).filter(lambda x: x['source']['dnetIdentifier'][0:2]!='70')
sources = input.map(lambda x: x['source']['dnetIdentifier']).distinct()
#the relations are bidirectional, so the set of nodes is uniquely identified by the sources
nodes = input.map(lambda x: x['source']['dnetIdentifier']).distinct().map(lambda x: prefix + x[3:] if not 'dedup' in x else prefix + x[17:]).distinct().map(lambda x : {'scholix' : x})
#creates a mapping between the original scholexplorer ids and the ids the records will have in OpenAIRE
scholexplorerMapOpenaire = sources.map(lambda x: {'scholexplorer': x, 'openaire': prefix + x[3:] if 'dedup' not in x else prefix + x[17:]})
scholexplorerMapOpenaire.map(json.dumps).saveAsTextFile(path = '/tmp/miriam/context_propagation/scholexplorerIdsMapOA', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
#reads the summaries (containing title, subject and description ) for the scholexplorer records
summaries = sc.textFile('/user/dnet.scholexplorer/scholix/provision/summary_json').map(json.loads).map(lambda x: {"title": x['title'], "description":[x['description']] if x['description'] is not None else [], "subject":x['subject'], "sid":x['id']})
sources = sources.map(lambda x: {'id':x})
#enriches data with summaries information from scholexplorer
sdf = sources.toDF()
smdf = summaries.toDF()
enriched_sx = sdf.join(smdf, sdf.id == smdf.sid).rdd.map(lambda x: {"id" : x['id'], "abstract": x['description'], "title":x['title'], "subject":x['subject']})
enriched_sx.map(json.dumps).saveAsTextFile(path = '/tmp/miriam/context_propagation/scholexplorerEnrichedSX', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
nodesdf = nodes.toDF()
nodesdf.createOrReplaceTempView("ids")
#associates the dedup openaire identifier to ingested scholix ids
dfr = spark.read.json('/tmp/beta_provision/graph/13_graph_blacklisted/relation')
relation = dfr.createOrReplaceTempView("relation")
mergedIds = spark.sql("SELECT source, target from relation join ids on relation.target = ids.scholix where datainfo.deletedbyinference = false and relclass = 'merges' ")
mergedIds.rdd.map(lambda x: {'dedup':x['source'], 'scholixId':x['target']}).map(json.dumps).saveAsTextFile(path='/tmp/miriam/context_propagation/scholixIdsMergedInOA', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
#new execution on dfr = spark.read.json('/tmp/miriam/context_propagation/14_graph_blacklisted/relation')
#this will be saved as the one above with the postfix 'production'
#replace the ingested scholix id with the deduped one
mergedIds.createOrReplaceTempView("merged")
changeInMerged = spark.sql("Select * from ids left join merged on ids.scholix = merged.target")
enrich = changeInMerged.rdd.map(lambda x: x['source'] if x['source'] is not None else x['scholix']).distinct().map(lambda x:{"enrich":x})
edf = enrich.toDF()
edf.createOrReplaceTempView("enrich_ids")
ddf = spark.read.json('/tmp/beta_provision/graph/13_graph_blacklisted/dataset')
ddf.createOrReplaceTempView("dataset")
#new execution on ddf = spark.read.json('/tmp/miriam/context_propagation/14_graph_blacklisted/dataset')
#enriches the scholix ingested records with information for title abstract and subject found in OpenAIRE
enriched_dataset = spark.sql("select a.* from (select id, title.value title, description.value description, collect_set(named_struct('scheme', MyS.qualifier.classid, 'value', MyS.value)) as subjects from dataset lateral view explode (subject)s as MyS where datainfo.deletedbyinference = false group by title.value, description.value, id) as a join enrich_ids on a.id = enrich_ids.enrich")
pdf = spark.read.json('/tmp/beta_provision/graph/13_graph_blacklisted/publication')
#new execution on pdf = spark.read.json('/tmp/miriam/context_propagation/14_graph_blacklisted/publication')
pdf.createOrReplaceTempView("publication")
enriched_publication = spark.sql("select a.* from (select id, title.value title, description.value description, collect_set(named_struct('scheme', MyS.qualifier.classid, 'value', MyS.value)) as subjects from publication lateral view explode (subject)s as MyS where datainfo.deletedbyinference = false group by title.value, description.value, id) as a join enrich_ids on a.id = enrich_ids.enrich")
enriched = enriched_dataset.rdd.map(lambda x: {"id":x['id'], 'title':x['title'], 'abstract':x['description'], 'subject':[{'scheme': subject['scheme'], "value": subject['value']} for subject in x['subjects']]}).union(enriched_publication.rdd.map(lambda x: {"id":x['id'], 'title':x['title'], 'abstract':x['description'], 'subject':[{'scheme': subject['scheme'], "value": subject['value']} for subject in x['subjects']]}))
enriched.map(json.dumps).saveAsTextFile(path='/tmp/miriam/context_propagation/scholixIdsEnrichedInOA', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
enriched.toDF().createOrReplaceTempView("enriched")
changeInMerged.createOrReplaceTempView("merged_ids")
#associates the possibly deduplicated openaire ids with the original scholix id in openaire
scholixIds = spark.sql("select scholix, id, title, abstract, subject from enriched left join merged_ids on id = source").rdd.map(lambda x: {"id":x['scholix'] if x['scholix'] is not None else x['id'], "title":x['title'], "abstract":x['abstract'],'subject':x['subject']})
scholixIds.map(json.dumps).saveAsTextFile(path='/tmp/miriam/context_propagation/scholixIdsEnrichedOA', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
sids=scholixIds.toDF()
sids.createOrReplaceTempView("scholix")
mdf = scholexplorerMapOpenaire.toDF()
mdf.createOrReplaceTempView("map")
#original scholexplorer ids with the enrichment from openaire
scholexplorerEnrichedOA = spark.sql("Select scholexplorer, title, abstract, subject from scholix join map on id = map.openaire")
scholexplorerEnrichedOA = scholexplorerEnrichedOA.rdd.map(lambda x: {'id': x['scholexplorer'], 'title':x['title'], 'abstract':x['abstract'],'subject':x['subject']} )
scholexplorerEnrichedOA.map(json.dumps).saveAsTextFile(path='/tmp/miriam/context_propagation/scholexplorerEnrichedOA', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
sxEnriched = enriched_sx.union(scholexplorerEnrichedOA).map(lambda x: (x['id'],x)).groupByKey().map(groupFunction)
sxEnriched.map(json.dumps).saveAsTextFile(path='/tmp/miriam/context_propagation/scholexplorerEnriched', compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
#select the set of relevant relationships
relation = sc.textFile('/user/dnet.scholexplorer/scholix/provision/scholix_dump/scholix_json').map(json.loads).filter(lambda x: x['source']['dnetIdentifier'][0:2]!='70' and x['target']['dnetIdentifier'][0:2] != '70')
pub_rel = relation.filter(lambda x: x['source']['objectType'] == 'publication' and x['target']['objectType'] == 'dataset').map(lambda x: {'source':x['source']['dnetIdentifier'], 'semantics':x['relationship']['name'].lower(), 'target':x['target']['dnetIdentifier']})
#167,210,655
#counts the semantics in the subgraph identified by publications with relations towards datasets
pub_rel.map(lambda x: (x['semantics'], 1)).reduceByKey(lambda a,b: a+b).collect()
#[(u'iscitedby', 11542), (u'reviews', 2051), (u'iscompiledby', 499), (u'unknown', 111706), (u'isnewversionof', 27977), (u'requires', 1), (u'isdocumentedby', 747), (u'describes', 211), (u'issourceof', 30877), (u'ismetadataof', 11), (u'isversionof', 269006), (u'ispartof', 454244), (u'issupplementedby', 1517666), (u'obsoletes', 5), (u'isreferencedby', 89986753), (u'isvariantformof', 3688), (u'hasassociationwith', 30), (u'isidenticalto', 293876), (u'haspart', 621177), (u'ismetadatafor', 121), (u'isrelatedto', 70310923), (u'issupplementto', 85460), (u'isoriginalformof', 476), (u'iscontinuedby', 356407), (u'cites', 200336), (u'ispreviousversionof', 24119), (u'hasversion', 273427), (u'isdescribedby', 5), (u'continues', 356582), (u'isreviewedby', 53), (u'documents', 265636), (u'compiles', 177), (u'references', 2004247), (u'isobsoletedby', 2), (u'isderivedfrom', 617)]
pub_dats_sem = {'issupplementedby':{'use':'latent', 'weight':1.0, 'path':set()}, 'cites':{'use':'reuse', 'weight':1.0, 'path':set()}, 'describes':{'use':'latent', 'weight':1.0, 'path':set()},'references':{'use':'reuse', 'weight':1.0, 'path':set()}, 'documents':{'use':'latent','weight':1, 'path':set()}}
pub_rel_subset = pub_rel.filter(lambda x: x['semantics'] in pub_dats_sem)
pub_rel_subset.count()
#3,988,096
pubdf = pub_rel_subset.toDF()
sxdf = sxEnriched.toDF()
pubs_enriched = pubdf.join(sxdf, pubdf.source == sxdf.id)
pubs_with_abst = pubs_enriched.rdd.filter(lambda x: x['abstract'] != [] or x['subject'] != []).map(lambda x: {'source':x['source'], 'semantics':x['semantics'], 'target': x['target']})
pubs_with_abst.count()
*/
}
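The commented prototype above keeps only a handful of publication→dataset semantics and assigns each of them a use ("latent" or "reuse") and a starting weight. A minimal Scala sketch of that mapping, assuming it mirrors what the Costants helper used by the Spark jobs provides (values taken from the pub_dats_sem dictionary in the comment):

import java.util

import eu.dnetlib.dhp.contextpropagation.model.PropagationUse

object PublicationDatasetSemantics {
  // publication -> dataset semantics that seed the propagation, with their use and starting weight
  private def seed(use: String): PropagationUse =
    PropagationUse.newInstance(use, "1.0", new util.ArrayList[String]())

  val values: Map[String, PropagationUse] = Map(
    "issupplementedby" -> seed("latent"),
    "cites" -> seed("reuse"),
    "describes" -> seed("latent"),
    "references" -> seed("reuse"),
    "documents" -> seed("latent")
  )

  def containedInPubSem(semantics: String): Boolean = values.contains(semantics)
}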

View File

@ -0,0 +1,113 @@
package eu.dnetlib.dhp.contextpropagation
import java.util
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.contextpropagation.model.{DatasetPropagationStructure, EnrichedEntries, Node, PropagationStructure, PropagationUse, RelationPropagation}
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
import eu.dnetlib.dhp.provision.scholix.Scholix
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import scala.collection.JavaConverters._
object SparkContextPropagation {
implicit val relationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
implicit val tupleForRelation: Encoder[(String, RelationPropagation)] = Encoders.tuple(Encoders.STRING, relationEncoder)
implicit val enrichedEntitiesEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
implicit val tupleForEntities: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEntitiesEncoder)
implicit val mapEncoderPub: Encoder[PropagationStructure] = Encoders.kryo[PropagationStructure]
implicit val mapEncoderDats: Encoder[DatasetPropagationStructure] = Encoders.kryo[DatasetPropagationStructure]
implicit val tupleForPropagation: Encoder[(String, PropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
implicit val tupleForPropagationDars: Encoder[(String, DatasetPropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderDats)
implicit val stringEncoder: Encoder[String] = Encoders.STRING
def getEnrichedPublications (allowedRelations: Dataset[RelationPropagation], enrichedEntitiesPath: String, spark: SparkSession): Dataset[RelationPropagation] = {
val startingPropagation = allowedRelations
.filter(r => r.getSource.getId.startsWith("50")).map(r => (r.getSource.getId, r))
val enrichedNodes = spark.read.load(enrichedEntitiesPath).as[EnrichedEntries]
.map(e => (e.getScholixId, e))
startingPropagation.joinWith(enrichedNodes, startingPropagation("_1").equalTo(enrichedNodes("_1"))).map(tuple => tuple._1._2)
}
def propagatePublicationDataset(pubs_rel : Dataset[RelationPropagation]): Dataset[(String, PropagationStructure)] ={
val publication_dataset : Dataset[(String, PropagationStructure)] = pubs_rel.map(r => {
val ps = new PropagationStructure
val pv : List[PropagationUse] = List(PropagationUse.copyInstance(Costants.getPublicationValue(r.getSemantics)))
ps.add(r.getSource.getId, pv.asJava)
(r.getTarget.getId, ps)
})
publication_dataset.groupByKey(_._1)
.agg(PropagationAggregator.getDatasetAggregator().toColumn)
}
def propagateDatasetDataset(propagation: Dataset[(String, PropagationStructure)], dataset_dataset : Dataset[(String, DatasetPropagationStructure)], count :Int): Dataset[(String, PropagationStructure)] = {
val pl2_step1 : Dataset [(String, PropagationStructure)] = propagation.joinWith(dataset_dataset, propagation("_1").equalTo(dataset_dataset("_1")))
.flatMap(PropagationUtils.propagateDataset)
val pl2 : Dataset [(String, PropagationStructure)] = propagation.union(pl2_step1).groupByKey(_._1)
.agg(PropagationAggregator.getDatasetAggregator().toColumn)
pl2
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertDatasetToJson.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/dataset2Json.json")))
parser.parseArgument(args)
val conf = new SparkConf
val spark = SparkSession.builder.config(conf).appName(SparkConvertDatasetToJson.getClass.getSimpleName).master(parser.get("master")).getOrCreate
val propagationOutputPath = parser.get("propagationOutputPath")
val allowedRelations = spark.read.load(parser.get("allowedRelationPath")).as[RelationPropagation]
val dataset_dataset : Dataset[(String, DatasetPropagationStructure)] = allowedRelations.filter(r => r.getSource.getId.startsWith("60"))
.map(r => {
val ps = new DatasetPropagationStructure
ps.add(r.getTarget.getId, PropagationUse.copyInstance(Costants.getDatasetValue(r.getSemantics)))
(r.getSource.getId, ps)
})
val pl1 : Dataset[(String, PropagationStructure)] = propagatePublicationDataset(
getEnrichedPublications(allowedRelations, parser.get("enrichedEntitiesPath"), spark ))
pl1.write.mode(SaveMode.Overwrite).save(s"$propagationOutputPath/pl1")
var propagation = pl1
var count = 1
do {
count += 1
propagation = propagateDatasetDataset(propagation, dataset_dataset, count )
propagation.write.mode(SaveMode.Overwrite).save(s"$propagationOutputPath/pl${count}" )
}
while(propagation.count() > 0)
}
}

View File

@ -0,0 +1,45 @@
package eu.dnetlib.dhp.contextpropagation
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.contextpropagation.model.{DatasetPropagationStructure, EnrichedEntries, MapSxOA, Node, PropagationStructure, PropagationUse, RelationPropagation}
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
import eu.dnetlib.dhp.provision.scholix.Scholix
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
/**
* It takes the summaries of the scholexplorer nodes involved in propagation
*/
object SparkEnrichScholixStep1 {
implicit val enrichedEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val tupleForJoinEncoder: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEncoder)
def getEnrichedSubset(scholixSelectedRelationPath:String, summaryPath: String, spark:SparkSession): Dataset[EnrichedEntries] = {
//selects the scholix nodes involved in propagation
val distinctNodes: Dataset[String] = PropagationUtils.getSelectedNodes(scholixSelectedRelationPath , spark)
val scholixSummaries = PropagationUtils.enrichScholix(summaryPath, spark).filter(o => o != null)
.map(e => (e.getScholixId, e))
//enriches the selected nodes with summary from scholexplorer
distinctNodes.joinWith(scholixSummaries, distinctNodes("value").equalTo(scholixSummaries("_1"))).map(pair => pair._2._2).filter(o => o != null)
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEnrichScholixStep1.getClass.getResourceAsStream("/eu/dnetlib/dhp/contextpropagation/enrich-scholexplorer.json")))
parser.parseArgument(args)
val conf = new SparkConf
val spark = SparkSession.builder.config(conf).appName(SparkEnrichScholixStep1.getClass.getSimpleName).master(parser.get("master")).getOrCreate
getEnrichedSubset(parser.get("inputPath"), parser.get("scholixSummaryPath"), spark)
.write.mode(SaveMode.Overwrite).save(parser.get("outputPath"))
}
}

View File

@ -0,0 +1,108 @@
package eu.dnetlib.dhp.contextpropagation
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.contextpropagation.model.{EnrichedEntries, MapSxOA}
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
import eu.dnetlib.dhp.provision.scholix.Scholix
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
/**
* It takes the enrichment from OpenAIRE. It considers only deduplicated entities, since those not deduplicated
* get their enrichment directly from Scholexplorer.
*
* One step for each result type.
*/
object SparkEnrichScholixStep2 {
implicit val enrichedEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val tupleForJoinEncoder: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEncoder)
implicit val mapEncoder: Encoder[MapSxOA] = Encoders.kryo[MapSxOA]
implicit val tupleForJoinMap: Encoder[(String, MapSxOA)] = Encoders.tuple(Encoders.STRING, mapEncoder)
def getMappingScholexplorerOpenAIRE(scolixPath: String, spark:SparkSession): Dataset[MapSxOA] ={
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val mapEncoder: Encoder[MapSxOA] = Encoders.kryo[MapSxOA]
val prefix = "50|scholix_____::"
spark.read.load(scolixPath).as[Scholix]
.map(s => s.getSource.getDnetIdentifier)(Encoders.STRING)
.filter(id => !id.startsWith("70|"))
.distinct()
.map(id => {
val map : MapSxOA = new MapSxOA()
if(id.contains("dedup")){
map.setOaid(prefix + id.substring(17))
}else{
map.setOaid(prefix + id.substring(3))
}
map.setScholixId(id)
map
})
}
def enrichOpenAIRE(resourcePath: String, relationPath : String, spark:SparkSession): Dataset[EnrichedEntries] = {
val mapper = new ObjectMapper()
mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
val result = spark.read.json(resourcePath)
val relation = spark.read.json(relationPath)
result.createOrReplaceTempView("result")
relation.createOrReplaceTempView("relation")
spark.sql("SELECT id, target, title.value title, description.value description, subject.value svalues, subject.qualifier.classid sscheme " +
" FROM relation" +
" JOIN result " +
" ON relation.source = result.id " +
" WHERE relation.datainfo.deletedbyinference = false " +
" AND relclass = 'merges'" +
" AND relation.target like '50|scholix%' "
)
.map(line => {
val ee : EnrichedEntries = new EnrichedEntries()
ee.setOpenAireId(line.getString(1))
ee.setTitle(line.getList(2))
ee.setDescription(line.getList(3))
ee.setSubject(PropagationUtils.getSubjectList(line.getList(4), line.getList(5)))
ee
})
}
def getEnrichedSubset(scholixPath:String, relationPath:String, resultPath: String, spark:SparkSession): Dataset[EnrichedEntries] = {
val openAireInfo = enrichOpenAIRE(resultPath, relationPath, spark)
.map(r => (r.getOpenAireId, r))
val mapping = getMappingScholexplorerOpenAIRE(scholixPath , spark).map(m => (m.getOaid, m))
mapping.joinWith(openAireInfo, mapping("_1").equalTo(openAireInfo("_1"))).map(t => {
val ret :EnrichedEntries = t._2._2
ret.setScholixId(t._1._2.getScholixId)
ret
})
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEnrichScholixStep2.getClass.getResourceAsStream("/eu/dnetlib/dhp/contextpropagation/enrich-openaire.json")))
parser.parseArgument(args)
val conf = new SparkConf
val spark = SparkSession.builder.config(conf).appName(SparkEnrichScholixStep2.getClass.getSimpleName).master(parser.get("master")).getOrCreate
getEnrichedSubset(parser.get("scholixPath"), parser.get("relationPath"), parser.get("resultPath"), spark)
.filter(o => o != null)
.write.mode(SaveMode.Overwrite).save(parser.get("outputPath"))
}
}
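A minimal sketch of the id-mapping rule applied by getMappingScholexplorerOpenAIRE above: deduplicated scholexplorer ids keep the hash that follows the 17-character "xx|dedup_wf_001::" prefix, all the others keep the hash after the 3-character type prefix, and both are re-prefixed with "50|scholix_____::" (the hashes in the comment are only illustrative):

def toOpenAireId(scholixId: String): String = {
  val prefix = "50|scholix_____::"
  if (scholixId.contains("dedup")) prefix + scholixId.substring(17)
  else prefix + scholixId.substring(3)
}

// toOpenAireId("50|dedup_wf_001::000239c0f7ec8507afd7e02b4a853b56") == "50|scholix_____::000239c0f7ec8507afd7e02b4a853b56"
// toOpenAireId("60|82368200e90cf75c714b58288a371bbe") == "50|scholix_____::82368200e90cf75c714b58288a371bbe"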

View File

@ -0,0 +1,43 @@
package eu.dnetlib.dhp.contextpropagation
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.contextpropagation.model.{EnrichedEntries, MapSxOA}
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
/**
* It puts together the outcomes of the two previous steps to get all the enrichments in a single entry
*/
object SparkEnrichScholixStep3 {
implicit val enrichedEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
implicit val tupleForJoinEncoder: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEncoder)
def getEnriched(scholixPath:String, openairePath: String, spark:SparkSession): Dataset[EnrichedEntries] = {
spark.read.load(scholixPath).as[EnrichedEntries]
.union(spark.read.load(s"$openairePath/publication").as[EnrichedEntries])
.union(spark.read.load(s"$openairePath/dataset").as[EnrichedEntries])
.union(spark.read.load(s"$openairePath/software").as[EnrichedEntries])
.union(spark.read.load(s"$openairePath/otherresearchproduct").as[EnrichedEntries])
.map(ee => (ee.getScholixId, ee))
.groupByKey(_._1)(Encoders.STRING)
.agg(PropagationAggregator.mergeEnrichedEntries().toColumn).map(c => c._2)
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEnrichScholixStep3.getClass.getResourceAsStream("/eu/dnetlib/dhp/contextpropagation/enrich-scholexplorer.json")))
parser.parseArgument(args)
val conf = new SparkConf
val spark = SparkSession.builder.config(conf).appName(SparkEnrichScholixStep3.getClass.getSimpleName).master(parser.get("master")).getOrCreate
getEnriched(parser.get("scholixEnrichedPath"), parser.get("openaireEnrichedPath"), spark)
.write.mode(SaveMode.Overwrite).save(parser.get("outputPath"))
}
}

View File

@ -0,0 +1,62 @@
package eu.dnetlib.dhp.contextpropagation
/**
* Selects all the Scholexplorer relations not involving nodes with prefix 70 (unknown) and for which the source node
* is different from the target node
*/
import java.util
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.contextpropagation.model.{Node, RelationPropagation}
import eu.dnetlib.dhp.provision.SparkConvertDatasetToJson
import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixEntityId}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import scala.collection.JavaConverters._
object SparkSelectScholixRelations {
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
def getPublisherList(item: List[ScholixEntityId]) : util.List[String] =
{
item.map(p=>p.getName).asJava
}
def getAllowedRelations(scholixPath: String, spark:SparkSession): Dataset[RelationPropagation] = {
spark.read.load(scholixPath).as[Scholix]
.filter(s => !s.getSource().getDnetIdentifier().substring(0,2).equals("70") )
.filter(s => !s.getTarget().getDnetIdentifier().substring(0,2).equals("70"))
.filter(s => !s.getSource.getDnetIdentifier.equals(s.getTarget.getDnetIdentifier))
.map(s => {
val rp = new RelationPropagation
if(s.getSource.getPublisher != null)
rp.setSource(Node.newInstance(s.getSource.getDnetIdentifier, getPublisherList(s.getSource.getPublisher.asScala.toList)))
else
rp.setSource(Node.newInstance(s.getSource.getDnetIdentifier, new util.ArrayList()))
if(s.getTarget.getPublisher != null)
rp.setTarget(Node.newInstance(s.getTarget.getDnetIdentifier, getPublisherList(s.getTarget.getPublisher.asScala.toList)))
else
rp.setTarget(Node.newInstance(s.getTarget.getDnetIdentifier, new util.ArrayList()))
rp.setSemantics(s.getRelationship.getName.toLowerCase())
rp
})
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkSelectScholixRelations.getClass.getResourceAsStream("/eu/dnetlib/dhp/contextpropagation/enrich-scholexplorer.json")))
parser.parseArgument(args)
val conf = new SparkConf
val spark = SparkSession.builder.config(conf).appName(SparkSelectScholixRelations.getClass.getSimpleName).master(parser.get("master")).getOrCreate
getAllowedRelations(parser.get("inputPath") , spark).write.mode(SaveMode.Overwrite).save(parser.get("outputPath"))
}
}

View File

@ -0,0 +1,22 @@
package eu.dnetlib.dhp.contextpropagation.model;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import eu.dnetlib.dhp.contextpropagation.model.PropagationUse;
public class DatasetPropagationStructure implements Serializable {
private Map<String, PropagationUse> propagation = new HashMap<>();
public Map<String, PropagationUse> getPropagation() {
return propagation;
}
public void add(String key, PropagationUse value) {
propagation.put(key, value);
}
}

View File

@ -0,0 +1,148 @@
package eu.dnetlib.dhp.contextpropagation.model;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.avro.generic.GenericData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap;
import eu.dnetlib.dhp.provision.scholix.summary.SchemeValue;
import eu.dnetlib.dhp.schema.dump.oaf.GeoLocation;
public class EnrichedEntries implements Serializable {
private static final Logger log = LoggerFactory.getLogger(EnrichedEntries.class);
private String scholixId;
private String openAireId;
private List<String> title;
private List<String> description;
private List<SchemeValue> subject;
public String getScholixId() {
return scholixId;
}
public void setScholixId(String scholixId) {
this.scholixId = scholixId;
}
public List<String> getTitle() {
return title;
}
public void setTitle(List<String> title) {
this.title = title;
}
public List<String> getDescription() {
return description;
}
public void setDescription(List<String> description) {
this.description = description;
}
public List<SchemeValue> getSubject() {
return subject;
}
public void setSubject(List<SchemeValue> subject) {
this.subject = subject;
}
public String getOpenAireId() {
return openAireId;
}
public void setOpenAireId(String openAireId) {
this.openAireId = openAireId;
}
public EnrichedEntries mergeWith(EnrichedEntries ee) throws JsonProcessingException {
if(ee == null){
throw new RuntimeException("ERROR: ee is null");
}
if (scholixId == null)
scholixId = ee.scholixId;
if (openAireId == null)
openAireId = ee.openAireId;
try {
Optional
.ofNullable(ee.getDescription())
.ifPresent(
d -> d
.stream()
.filter(Objects::nonNull)
.forEach(this::mergeAbstract));
Optional
.ofNullable((ee.getTitle()))
.ifPresent(
t -> t
.stream()
.filter(Objects::nonNull)
.forEach(this::mergeTitle));
Optional
.ofNullable(ee.getSubject())
.ifPresent(
s -> s
.stream()
.filter(Objects::nonNull)
.forEach(this::mergeSubject));
} catch (Exception e) {
throw new RuntimeException("Error in merging " + ee.getScholixId(), e);
}
return this;
}
private void mergeSubject(SchemeValue sbj) {
if (subject == null) {
subject = new ArrayList<>();
}
for (SchemeValue s : subject) {
if (s.getValue().equals(sbj.getValue())) {
return;
}
}
subject.add(sbj);
}
private void mergeAbstract(String dex) {
if (description == null) {
description = new ArrayList<>();
}
merge(dex, description);
}
private void mergeTitle(String t) {
if (title == null) {
title = new ArrayList<>();
}
merge(t, title);
}
private void merge(String st, List<String> lst) {
for (String d : lst) {
if (d.equals(st))
return;
}
lst.add(st);
}
}
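A minimal usage sketch of mergeWith (identifiers and titles below are made up): fields that are already set are kept, missing ones are filled from the other record, and duplicate titles, descriptions and subjects are added only once.

import scala.collection.JavaConverters._

import eu.dnetlib.dhp.contextpropagation.model.EnrichedEntries

val fromScholix = new EnrichedEntries()
fromScholix.setScholixId("50|scholix_____::aaa") // made-up id
fromScholix.setTitle(new java.util.ArrayList[String](List("A title").asJava))

val fromOpenaire = new EnrichedEntries()
fromOpenaire.setOpenAireId("50|dedup_wf_001::bbb") // made-up id
fromOpenaire.setTitle(new java.util.ArrayList[String](List("A title", "Another title").asJava))

val merged = fromScholix.mergeWith(fromOpenaire)
// merged keeps both identifiers and ends up with title = ["A title", "Another title"]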

View File

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.contextpropagation.model;
import java.io.Serializable;
public class MapSxOA implements Serializable {
private String scholixId;
private String oaid;
public String getScholixId() {
return scholixId;
}
public void setScholixId(String scholixId) {
this.scholixId = scholixId;
}
public String getOaid() {
return oaid;
}
public void setOaid(String oaid) {
this.oaid = oaid;
}
}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.dhp.contextpropagation.model;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
public class Node implements Serializable {
private String id;
private List<String> publisher;
public List<String> getPublisher() {
return publisher;
}
public void setPublisher(List<String> publisher) {
this.publisher = publisher;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public static Node newInstance(String id, List<String> publisher) {
Node n = new Node();
n.id = id;
n.publisher = publisher;
return n;
}
}

View File

@ -0,0 +1,62 @@
package eu.dnetlib.dhp.contextpropagation.model;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class PropagationStructure implements Serializable {
private Map<String, List<PropagationUse>> propagation = new HashMap<>();
public Map<String, List<PropagationUse>> getPropagation() {
return propagation;
}
public void add(String key, List<PropagationUse> value) {
propagation.put(key, value);
}
public void setPropagation(Map<String, List<PropagationUse>> propagation) {
this.propagation = propagation;
}
private void mergeList(PropagationUse use, List<PropagationUse> acc) {
if (acc == null) {
acc = new ArrayList<>();
}
for (PropagationUse pu : acc) {
if (use.getUse().equals(pu.getUse())) {
// same use: union the paths and keep the larger weight (weights are numeric strings such as "1.0")
pu.getPath().addAll(use.getPath());
if (Double.parseDouble(pu.getWeight()) < Double.parseDouble(use.getWeight())) {
pu.setWeight(use.getWeight());
}
return;
}
}
acc.add(use);
}
public PropagationStructure mergeFrom(PropagationStructure ps) {
if (ps == null)
return this;
for (String key : ps.propagation.keySet()) {
if (propagation.containsKey(key)) {
ps.propagation.get(key).forEach(use -> mergeList(use, propagation.get(key)));
} else {
propagation
.put(
key,
ps.propagation
.get(key)
.stream()
.map(pu -> PropagationUse.copyInstance(pu))
.collect(Collectors.toList()));
}
}
return this;
}
}
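A minimal sketch of how mergeFrom combines two structures (ids and weights below are made up): for the same key, entries with the same use have their paths unioned and keep the larger weight, while keys not yet present are copied over.

import java.util

import scala.collection.JavaConverters._

import eu.dnetlib.dhp.contextpropagation.model.{PropagationStructure, PropagationUse}

def use(u: String, weight: String, path: String*): PropagationUse =
  PropagationUse.newInstance(u, weight, new util.ArrayList[String](path.asJava))

val first = new PropagationStructure
first.add("50|pub_1", new util.ArrayList[PropagationUse](List(use("reuse", "0.7", "60|ds_1")).asJava))

val second = new PropagationStructure
second.add("50|pub_1", new util.ArrayList[PropagationUse](List(use("reuse", "0.9", "60|ds_2")).asJava))

first.mergeFrom(second)
// the single "reuse" entry for "50|pub_1" now has weight "0.9" and path ["60|ds_1", "60|ds_2"]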

View File

@ -0,0 +1,52 @@
package eu.dnetlib.dhp.contextpropagation.model;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
public class PropagationUse implements Serializable {
private String use;
private String weight;
private List<String> path;
public String getUse() {
return use;
}
public void setUse(String use) {
this.use = use;
}
public String getWeight() {
return weight;
}
public void setWeight(String weight) {
this.weight = weight;
}
public List<String> getPath() {
return path;
}
public void setPath(List<String> path) {
this.path = path;
}
public static PropagationUse newInstance(String use, String weight, List<String> path) {
PropagationUse pu = new PropagationUse();
pu.use = use;
pu.weight = weight;
pu.path = path;
return pu;
}
public static PropagationUse copyInstance(PropagationUse use) {
PropagationUse pu = new PropagationUse();
// copy the path list so that later additions on the copy do not modify the original instance
pu.path = use.path == null ? null : new ArrayList<>(use.path);
pu.weight = use.weight;
pu.use = use.use;
return pu;
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.contextpropagation.model;
import java.io.Serializable;
import java.util.ArrayList;
public class Publisher extends ArrayList<String> implements Serializable {
public Publisher() {
super();
}
}

View File

@ -0,0 +1,39 @@
package eu.dnetlib.dhp.contextpropagation.model;
import java.io.Serializable;
import eu.dnetlib.dhp.contextpropagation.model.Node;
public class RelationPropagation implements Serializable {
private Node source;
private Node target;
private String semantics;
public RelationPropagation() {
}
public Node getSource() {
return source;
}
public void setSource(Node source) {
this.source = source;
}
public Node getTarget() {
return target;
}
public void setTarget(Node target) {
this.target = target;
}
public String getSemantics() {
return semantics;
}
public void setSemantics(String semantics) {
this.semantics = semantics;
}
}

View File

@ -24,8 +24,6 @@ public class Constants {
public static String RESEARCH_INFRASTRUCTURE = "Research Infrastructure/Initiative";
public static String ORCID = "orcid";
static {
accessRightsCoarMap.put("OPEN", "c_abf2");
accessRightsCoarMap.put("RESTRICTED", "c_16ec");

View File

@ -503,7 +503,7 @@ public class ResultMapper implements Serializable {
private static Pid getOrcid(List<StructuredProperty> p) {
for (StructuredProperty pid : p) {
if (pid.getQualifier().getClassid().equals(Constants.ORCID)) {
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
if (di.isPresent()) {
return Pid

View File

@ -68,7 +68,7 @@ public abstract class AbstractMdRecordToOafMapper {
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/";
protected static final Qualifier ORCID_PID_TYPE = qualifier(
"ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES);
ModelConstants.ORCID_PENDING, ModelConstants.ORCID_CLASSNAME, DNET_PID_TYPES, DNET_PID_TYPES);
protected static final Qualifier MAG_PID_TYPE = qualifier(
"MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES);

View File

@ -19,6 +19,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
@ -61,7 +62,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
author.setPid(new ArrayList<>());
if (StringUtils.isNotBlank(pid)) {
if (type.startsWith("ORCID")) {
if (type.toLowerCase().startsWith(ORCID)) {
final String cleanedId = pid
.replaceAll("http://orcid.org/", "")
.replaceAll("https://orcid.org/", "");

View File

@ -20,6 +20,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
@ -98,7 +99,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
.replaceAll(" ", "")
.replaceAll("_", "");
if (type.startsWith("ORCID")) {
if (type.toLowerCase().startsWith(ModelConstants.ORCID)) {
final String cleanedId = id.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", "");
res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info));
} else if (type.startsWith("MAGID")) {

View File

@ -0,0 +1,31 @@
[{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
}
,{
"paramName": "sp",
"paramLongName": "scholixPath",
"paramDescription": "the path of the scholix summaries",
"paramRequired": false
},
{
"paramName": "rp",
"paramLongName": "relationPath",
"paramDescription": "the openaire graph input path",
"paramRequired": false
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the output path for the selected scholix relations",
"paramRequired": false
},{
"paramName": "rePath",
"paramLongName": "resultPath",
"paramDescription": "the output path for the selected scholix relations",
"paramRequired": false
}
]

View File

@ -0,0 +1,35 @@
[{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
}
,{
"paramName": "ssp",
"paramLongName": "scholixSummaryPath",
"paramDescription": "the path of the scholix summaries",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the openaire graph input path",
"paramRequired": false
},{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the output path for the selected scholix relations",
"paramRequired": false
},{
"paramName": "sep",
"paramLongName": "scholixEnrichedPath",
"paramDescription": "the output path for the selected scholix relations",
"paramRequired": false
},{
"paramName": "oep",
"paramLongName": "openaireEnrichedPath",
"paramDescription": "the output path for the selected scholix relations",
"paramRequired": false
}
]

View File

@ -0,0 +1,77 @@
<configuration>
<!-- OCEAN -->
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<!-- <property>-->
<!-- <name>spark2YarnHistoryServerAddress</name>-->
<!-- <value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>-->
<!-- </property>-->
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<!-- GARR -->
<!-- <property>-->
<!-- <name>jobTracker</name>-->
<!-- <value>yarn</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>nameNode</name>-->
<!-- <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>hiveMetastoreUris</name>-->
<!-- <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>spark2YarnHistoryServerAddress</name>-->
<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
<!-- </property>-->
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<!-- <property>-->
<!-- <name>spark2EventLogDir</name>-->
<!-- <value>/user/spark/spark2ApplicationHistory</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>spark2ExtraListeners</name>-->
<!-- <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>spark2SqlQueryExecutionListeners</name>-->
<!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
<!-- </property>-->
</configuration>

View File

@ -0,0 +1,290 @@
<workflow-app name="Context Propagation Preparation" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>scholixPath</name>
<description>the Scholix Path</description>
</property>
<property>
<name>scholixSummaryPath</name>
<description>the Scholix Summaries Path</description>
</property>
<property>
<name>inputPath</name>
<description>the OpenAIRE Graph Input Path</description>
</property>
<property>
<name>hiveDbName</name>
<description>the target hive database name</description>
</property>
<property>
<name>hiveJdbcUrl</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hiveMetastoreUris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="select_scholix_relations"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="select_scholix_relations">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Select Scholix Relations</name>
<class>eu.dnetlib.dhp.contextpropagation.SparkSelectScholixRelations</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${scholixPath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/scholixAllowedRelations</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="enrich_scholix_step1"/>
<error to="Kill"/>
</action>
<action name="enrich_scholix_step1">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Enrich Scholix Step1</name>
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep1</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/scholixAllowedRelations</arg>
<arg>--scholixSummaryPath</arg><arg>${scholixSummaryPath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/scholixEnriched</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="fork_enrich_scholix_step2"/>
<error to="Kill"/>
</action>
<fork name="fork_enrich_scholix_step2">
<path start="enrich_scholix_step2_publication"/>
<path start="enrich_scholix_step2_dataset"/>
<path start="enrich_scholix_step2_software"/>
<path start="enrich_scholix_step2_orp"/>
</fork>
<action name="enrich_scholix_step2_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Enrich Scholix Step2</name>
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--scholixPath</arg><arg>${scholixPath}</arg>
<arg>--relationPath</arg><arg>${inputPath}/relation</arg>
<arg>--resultPath</arg><arg>${inputPath}/publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/publication</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="join_enrich"/>
<error to="Kill"/>
</action>
<action name="enrich_scholix_step2_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Enrich Scholix Step2</name>
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--scholixPath</arg><arg>${scholixPath}</arg>
<arg>--relationPath</arg><arg>${inputPath}/relation</arg>
<arg>--resultPath</arg><arg>${inputPath}/dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/dataset</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="join_enrich"/>
<error to="Kill"/>
</action>
<action name="enrich_scholix_step2_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Enrich Scholix Step2</name>
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--scholixPath</arg><arg>${scholixPath}</arg>
<arg>--relationPath</arg><arg>${inputPath}/relation</arg>
<arg>--resultPath</arg><arg>${inputPath}/software</arg>
<arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/software</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="join_enrich"/>
<error to="Kill"/>
</action>
<action name="enrich_scholix_step2_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Enrich Scholix Step2</name>
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep2</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--scholixPath</arg><arg>${scholixPath}</arg>
<arg>--relationPath</arg><arg>${inputPath}/relation</arg>
<arg>--resultPath</arg><arg>${inputPath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/openaireEnriched/otherresearchproduct</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="join_enrich"/>
<error to="Kill"/>
</action>
<join name="join_enrich" to="enrich_scholix_step3"/>
<action name="enrich_scholix_step3">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Enrich Scholix Step3</name>
<class>eu.dnetlib.dhp.contextpropagation.SparkEnrichScholixStep3</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--scholixEnrichedPath</arg><arg>${workingDir}/scholixEnriched</arg>
<arg>--openaireEnrichedPath</arg><arg>${workingDir}/openaireEnriched</arg>
<arg>--outputPath</arg><arg>${workingDir}/enrichedEntities</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
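A minimal job.properties sketch that could drive the workflow above, submitted with the standard Oozie client; every path, address and memory value is a placeholder to adapt to the target cluster (the scholix and graph paths reuse the locations mentioned in the prototype comments, while hiveDbName, hiveJdbcUrl and hiveMetastoreUris come from the config-default.xml above):

# placeholders only: adapt paths, memory settings and addresses to the target cluster
nameNode=hdfs://nameservice1
jobTracker=yarnRM
queueName=default
oozieLauncherQueueName=default
oozieActionShareLibForSpark2=spark2
spark2YarnHistoryServerAddress=http://<history-server-host>:18089
spark2EventLogDir=/user/spark/spark2ApplicationHistory
sparkSqlWarehouseDir=/user/hive/warehouse
sparkDriverMemory=4G
sparkExecutorMemory=6G
sparkExecutorCores=2
workingDir=/tmp/context_propagation/working_dir
scholixPath=/user/dnet.scholexplorer/scholix/provision/scholix_dump/scholix_json
scholixSummaryPath=/user/dnet.scholexplorer/scholix/provision/summary_json
inputPath=/tmp/beta_provision/graph/13_graph_blacklisted
oozie.wf.application.path=${nameNode}/<deployed-workflow-dir>

# submit with: oozie job -oozie http://<oozie-host>:11000/oozie -config job.properties -run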

View File

@ -0,0 +1,22 @@
package eu.dnetlib.dhp.contextpropagation
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.contextpropagation.model.{EnrichedEntries, RelationPropagation}
import eu.dnetlib.dhp.provision.scholix.Scholix
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
import org.apache.spark.sql.{Encoder, Encoders}
class PropagationTest extends java.io.Serializable {
val m: ObjectMapper = new ObjectMapper()
m.enable(SerializationFeature.INDENT_OUTPUT)
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
implicit val enrichedEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
implicit val tupleForJoinEncoder: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEncoder)
}

View File

@ -0,0 +1,144 @@
package eu.dnetlib.dhp.contextpropagation
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.contextpropagation.model.{EnrichedEntries, RelationPropagation}
import eu.dnetlib.dhp.provision.scholix.Scholix
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.Assertions.{assertFalse, assertNotNull, assertTrue}
class ScholixTest extends java.io.Serializable{
val m: ObjectMapper = new ObjectMapper()
m.enable(SerializationFeature.INDENT_OUTPUT)
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
implicit val enrichedEncoder: Encoder[EnrichedEntries] = Encoders.kryo[EnrichedEntries]
implicit val tupleForJoinEncoder: Encoder[(String, EnrichedEntries)] = Encoders.tuple(Encoders.STRING, enrichedEncoder)
@Test
def selectScholexplorerRelationTest(): Unit ={
val sourcePath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/scholix-relations-00000.parquet").getPath
val conf : SparkConf = new SparkConf()
val spark: SparkSession = SparkSession.builder().appName("SelectScholixRelationTest").master("local").config(conf).getOrCreate()
val tmp = SparkSelectScholixRelations.getAllowedRelations(sourcePath, spark)
tmp.write.mode(SaveMode.Overwrite).save("/tmp/temp")
assert(tmp.count > 0)
}
@Test
def SelectDistinctIDTest(): Unit ={
val sourcePath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/producedInfo/selectedRelations.parquet").getPath
val conf : SparkConf = new SparkConf()
val spark: SparkSession = SparkSession.builder().appName("SelectDistinctIdsTest").master("local").config(conf).getOrCreate()
implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
val allowedRelations = spark.read.load(sourcePath).as[RelationPropagation]
val numberOfNodes = allowedRelations.map(r => r.getSource.getId)(Encoders.STRING)
.union(allowedRelations.map(r => r.getTarget.getId)(Encoders.STRING)).count()
val tmp : Dataset[String]= PropagationUtils.getSelectedNodes(sourcePath, spark)
assert (numberOfNodes > tmp.count())
}
@Test
def mappingScholixOpenAIRETest(): Unit ={
val sourcePath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/scholix-relations-00000.parquet").getPath
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
//val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()
val spark: SparkSession = SparkSession.builder().appName("Test").master("local").config(new SparkConf()).getOrCreate()
val tmp = SparkEnrichScholixStep2.getMappingScholexplorerOpenAIRE(sourcePath, spark)
tmp.filter(e => e.getScholixId.contains("dedup"))
.foreach(e => assertTrue(e.getScholixId.substring(17).equals(e.getOaid.substring(17))))
tmp.filter(e => !e.getScholixId.contains("dedup"))
.foreach(e => assertTrue(e.getOaid.substring(17).equals(e.getScholixId.substring(3))))
}
@Test
def enrichScholixTest():Unit = {
val summaryPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/part-00000-summaries.parquet").getPath
val relationPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/producedInfo/selectedRelations.parquet").getPath
val conf : SparkConf = new SparkConf()
val spark: SparkSession = SparkSession.builder().config(conf) .appName("Test").master("local").getOrCreate()
val tmp = SparkEnrichScholixStep1.getEnrichedSubset(relationPath, summaryPath, spark)
assert(tmp.count() == 5)
//tmp.write.mode(SaveMode.Overwrite).save("/tmp/scholixEnriched")
}
@Test
def enrichOpenAIRETest():Unit = {
val conf : SparkConf = new SparkConf()
val spark: SparkSession = SparkSession.builder().config(conf) .appName("Test").master("local").getOrCreate()
val scholixPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/scholix-relations-00000.parquet").getPath
val relationPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/relation.json").getPath
val resultPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/input/result/publication").getPath
val tmp = SparkEnrichScholixStep2.getEnrichedSubset(scholixPath, relationPath , resultPath , spark)
print(tmp.count())
assert(tmp.count() == 1)
tmp.write.mode(SaveMode.Overwrite).save("/tmp/openaireEnriched")
}
@Test
def mergeEnrichmentsTest():Unit = {
val conf : SparkConf = new SparkConf()
val spark: SparkSession = SparkSession.builder().config(conf) .appName("Test").master("local").getOrCreate()
val scholixPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/producedInfo/scholixEnriched.parquet").getPath
val resultPath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/producedInfo/result").getPath
val tmp = SparkEnrichScholixStep3.getEnriched(scholixPath, resultPath , spark)
assert(tmp.count() == 5)
tmp.write.mode(SaveMode.Overwrite).save("/tmp/mergedEnriched")
tmp.foreach(r => print(m.writeValueAsString(r)))
}
}

View File

@ -0,0 +1,169 @@
package eu.dnetlib.dhp.contextpropagation
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.contextpropagation.model.{DatasetPropagationStructure, EnrichedEntries, MapSxOA, Node, PropagationStructure, PropagationUse, RelationPropagation}
import eu.dnetlib.dhp.provision.scholix.summary.{SchemeValue, ScholixSummary}
import eu.dnetlib.dhp.provision.scholix.{Scholix, ScholixEntityId}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Assertions.{assertFalse, assertNotNull}
import org.junit.jupiter.api.Test
import scala.collection.JavaConverters._
class TestProva extends java.io.Serializable{
val m: ObjectMapper = new ObjectMapper()
m.enable(SerializationFeature.INDENT_OUTPUT)
// @Test
// def testFunderRelationshipsMapping(): Unit = {
//
//
// def findInDats(dats: Dataset[(String, DatasetPropagationStructure)], elem:String) : Dataset[(String, DatasetPropagationStructure)] = {
// dats.filter(dats("_1") === elem)
// }
//
//
// val sourcePath = getClass.getResource("/eu/dnetlib/dhp/contextpropagation/part-00000.parquet").getPath
//
//
// implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
// implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
// implicit val propagationEncoder: Encoder[RelationPropagation] = Encoders.kryo[RelationPropagation]
// implicit val mapEncoderPub: Encoder[PropagationStructure] = Encoders.kryo[PropagationStructure]
// implicit val mapEncoderDats: Encoder[DatasetPropagationStructure] = Encoders.kryo[DatasetPropagationStructure]
// implicit val tupleForPropagation: Encoder[(String, PropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
// implicit val tupleForPropagationDars: Encoder[(String, DatasetPropagationStructure)] = Encoders.tuple(Encoders.STRING, mapEncoderDats)
// implicit val stringEncoder: Encoder[String] = Encoders.STRING
//
//
// val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()
//
//
// val ds: Dataset[Scholix] = spark.read.load(sourcePath).as[Scholix]
//
// val allowedRelations : Dataset[RelationPropagation] = ds
// .filter(s => !s.getSource().getDnetIdentifier().substring(0,2).equals("70") )
// .filter(s => !s.getTarget().getDnetIdentifier().substring(0,2).equals("70"))
// .map(s => {
// val rp = new RelationPropagation
// rp.setSource(Node.newInstance(s.getSource.getDnetIdentifier))//, getPublisherList(s.getSource.getPublisher.asScala.toList)))
// rp.setTarget(Node.newInstance(s.getTarget.getDnetIdentifier))//, getPublisherList(s.getTarget.getPublisher.asScala.toList)))
// rp.setSemantics(s.getRelationship.getName)
// rp
// })
//
//
// //println(allowedRelations.count())
//
// val pubs_rel : Dataset[RelationPropagation] = allowedRelations.filter(r => r.getSource.getId.startsWith("50"))
// .filter(r => r.getTarget.getId.startsWith("60")).filter(r => Costants.containedInPubSem(r.getSemantics.toLowerCase()))
//
// val dats_rel : Dataset[RelationPropagation] = allowedRelations
// .filter(r => r.getSource.getId.startsWith("60")
// && r.getTarget.getId.startsWith("60")
// && Costants.containedInDatsSem(r.getSemantics.toLowerCase())
// && r.getSource.getId != r.getTarget.getId)
//
// val publication_dataset : Dataset[(String, PropagationStructure)] = pubs_rel.map(r => {
// val ps = new PropagationStructure
//
// val pv : List[PropagationUse] = List(PropagationUse.copyInstance(Costants.getPublicationValue(r.getSemantics.toLowerCase())))
// ps.add(r.getSource.getId, pv.asJava)
// (r.getTarget.getId, ps)
//
// })
//
//
// val pl1 : Dataset[(String, PropagationStructure)] = publication_dataset.groupByKey(_._1)(Encoders.STRING)
// .agg(PropagationAggregator.getDatasetAggregator().toColumn)
//
//
//
//
//
// // print(pl1.count)
//
// val dataset_dataset : Dataset[(String, DatasetPropagationStructure)] = dats_rel.map(r => {
// val ps = new DatasetPropagationStructure
//
// ps.add(r.getTarget.getId, PropagationUse.copyInstance(Costants.getDatasetValue(r.getSemantics.toLowerCase())))
// (r.getSource.getId, ps)
//
// })
////
//// //pl1.foreach(r => print(m.writeValueAsString(r._1)))
////
////
////
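// // Replace a single source id with a different one (apparently to force a match in the join below).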
// val dataset_dataset_modified : Dataset[(String, DatasetPropagationStructure)] =
// dataset_dataset.map(ds => {
// if(ds._1 == "60|4b5e9fa8e91b206001589993179f69d1"){
// ("60|82368200e90cf75c714b58288a371bbe", ds._2)
// }
// else{
// ds
// }
// })
////
//// // findInDats(dataset_dataset_modified, "60|82368200e90cf75c714b58288a371bbe").show(false)
////
////
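// // Left join the aggregated publication propagation with the (modified) dataset -> dataset
// // structures and propagate one step further via PropagationUtils.propagateDataset.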
// val pl2_step1 = pl1.joinWith(dataset_dataset_modified, pl1("value")
// .equalTo(dataset_dataset_modified("_1")), "left")
// .flatMap(PropagationUtils.propagateDataset)
//
//
//
//
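// // Re-aggregate by dataset id after the propagation step and print the resulting count.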
// val pl2 = pl2_step1.groupByKey(_._1)(Encoders.STRING).agg(PropagationAggregator.getDatasetAggregator().toColumn)
// print(pl2.count())
//
//
//// pl1.foreach(i=> {
//// if (i._1 =="60|b91b1296e3e37523887c2eaaf3f2e673")
//// print(m.writeValueAsString(i))
//// })
////
//// print(pl1.count)
//
////
//
// // print(m.writeValueAsString(dsprob.getPropagation.get(source).getUse))
//
//// print(dataset_dataset.map(d => {
//// var found : Boolean = false
//// for (elem <- d._2.getPropagation.keySet().asScala){
//// if (d._2.getPropagation.get(elem).getUse == "proxy"){
//// found = true
//// }
//// }
//// if (found){
//// d
//// }else{
//// null
//// }
//// }).filter(o => o != null).first()._1)
//
//
//// dataset_dataset.foreach(d => {
////
//// for (elem <- d._2.getPropagation.keySet().asScala){
//// if (d._2.getPropagation.get(elem).getUse == "reuse"){
//// print("reuse")
//// }
//// }
//// println()
//// })
//
// }
}


@@ -0,0 +1,7 @@
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-prod","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"properties":[],"relClass":"merges","relType":"resultResult","source":"50|dedup_wf_001::000239c0f7ec8507afd7e02b4a853b56","subRelType":"dedup","target":"50|scholix_____::e6c6f093eb4f8c48201c157f5fcdd8f8"}
{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite"}],"dataInfo":{"deletedbyinference":false,"invisible":false,"trust":"0.9"},"properties":[],"relClass":"merges","relType":"resultResult","source":"50|dedup_wf_001::002cf1de4469a0a318fdd1ff009659ec","subRelType":"relationship","target":"50|scholix_____::08d3a09fc700d2f614556cdd23762ad7"}
{"collectedfrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite"}],"dataInfo":{"deletedbyinference":false,"invisible":false,"trust":"0.9"},"properties":[],"relClass":"merges","relType":"resultResult","source":"50|dedup_wf_001::00310fc57d006a502e06411f3ab35424","subRelType":"relationship","target":"50|scholix_____::e17f731657c15f24c42bfca61c26b113"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_similarities_standard","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1612386060902,"properties":[{"key":"similarityLevel","value":"0.7032"}],"relClass":"hasAmongTopNSimilarDocuments","relType":"resultResult","source":"50|dedup_wf_001::0031d1f2103ebb2979c785e1b00b2319","subRelType":"similarity","target":"50|dedup_wf_001::c8ae7b6f575767dbebb18d58870b582b"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"decisiontree-dedup-prod","inferred":true,"invisible":false,"provenanceaction":{"classid":"sysimport:dedup","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"properties":[],"relClass":"isMergedIn","relType":"resultResult","source":"50|datacite____::f97dc7ffbd237a68b9954095dd56dd91","subRelType":"dedup","target":"50|dedup_wf_001::569ad6db85b9568dfbd388a749c479f8"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_referencedProjects","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.6573"},"lastupdatetimestamp":1612386075869,"properties":[],"relClass":"produces","relType":"resultProject","source":"40|aka_________::02c787a3a97d7bd6946672a8ec74ecfe","subRelType":"outcome","target":"50|dedup_wf_001::27514076973e90990e5cd9205fcc5317"}
{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:project:semrel","classname":"result:project:semrel","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"trust":"0.85"},"properties":[],"relClass":"produces","relType":"resultProject","source":"40|anr_________::d3c7c989a9e114593c7cb8f77edde5a3","subRelType":"outcome","target":"50|scholix_____::b79951545b294686860f14471f174ccc"}

Some files were not shown because too many files have changed in this diff.