Compare commits


34 Commits

Author SHA1 Message Date
Claudio Atzori ef612105ba [broker] added coalesce(1) on the stats dataset before storing it on postgres 2021-07-09 15:47:06 +02:00
Miriam Baglioni c64c0a0743 Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop 2021-06-29 17:59:40 +02:00
Miriam Baglioni 8abdd9bad2 added step of normalization for the doi 2021-06-29 17:59:37 +02:00
Miriam Baglioni e1cd2e406e added class to test the normalization of dois 2021-06-29 17:55:03 +02:00
Miriam Baglioni 29828bc273 added resource to test the normalization of doi during the import of MAG 2021-06-29 17:54:14 +02:00
Miriam Baglioni 0f402a44fb slight modification of the resource to also accommodate doi normalization tests 2021-06-29 17:53:49 +02:00
Miriam Baglioni 2aa565ee6c added close command for the SparkContext 2021-06-29 17:53:10 +02:00
Miriam Baglioni d5d21254a2 added tests for the normalization of the dois 2021-06-29 17:50:07 +02:00
Miriam Baglioni 5e2f330239 added tests for the normalization of the dois 2021-06-29 17:49:39 +02:00
Miriam Baglioni 8320ad2248 added tests for the normalization of the dois 2021-06-29 17:49:11 +02:00
Miriam Baglioni 011e629df5 there is no longer any need to lowercase the doi, since the normalized doi is saved in the first phase 2021-06-29 17:48:42 +02:00
Miriam Baglioni 22ae8a81c2 added the normalization step to the doi 2021-06-29 17:45:24 +02:00
Miriam Baglioni 779015e4a9 added the normalization step to the doi from crossref 2021-06-29 14:56:58 +02:00
Miriam Baglioni 3dd5701948 added the normalization step to the doi from crossref 2021-06-29 12:10:27 +02:00
Miriam Baglioni a5c1c0e90a added the normalization step to the doi from orcid before returning it 2021-06-29 12:03:54 +02:00
Miriam Baglioni dc5ed6f563 Added method to normalize doi values (lower case, remove everything preceding the 10. prefix, filter out dois not starting with 10.; see the sketch below this list) 2021-06-29 12:03:13 +02:00
Miriam Baglioni 7e13817262 Update 'dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java'
fixed a problem in the SQL query
2021-06-24 15:32:21 +02:00
Miriam Baglioni 59c36eb185 check if pid is null (to avoid NullPointerException) 2021-06-21 10:41:47 +02:00
Claudio Atzori 6b8c357381 removed extra whitespace at the end of the file 2021-06-18 16:08:45 +02:00
Claudio Atzori c0d2b62e46 [doiboost] added missing implicit Encoder 2021-06-18 15:57:41 +02:00
Claudio Atzori a3948c1f6e cleanup old doiboost workflows 2021-06-18 15:14:08 +02:00
Claudio Atzori fddbc8364e Merge branch 'alessia.bardi-datepicker' 2021-06-17 09:24:46 +02:00
Alessia Bardi 6208b04f1d smarter DatePicker for ISO dates on dateofacceptance 2021-06-16 14:56:26 +02:00
Sandro La Bruzzo 9ca438d9b1 imported the generation of the Datacite ActionSet from the stable_ids branch 2021-06-10 14:59:45 +02:00
Sandro La Bruzzo 42ff7a5665 some fixes to the pom to compile Scala 2021-06-10 14:31:06 +02:00
Sandro La Bruzzo ebe6aa6d38 implemented the Datacite transformation on master as well 2021-06-10 10:52:36 +02:00
Claudio Atzori a4cfabdbc6 Merge pull request 'master' (#111) from antonis.lempesis/dnet-hadoop:master into master
Reviewed-on: #111
2021-05-28 14:09:12 +02:00
Claudio Atzori 338327171d integrating pull #109, H2020Classification 2021-05-27 11:57:01 +02:00
Claudio Atzori 6cbda49112 more pervasive use of constants from ModelConstants, especially for ORCID 2021-05-26 18:13:04 +02:00
Claudio Atzori ea9b00ce56 adjusted test 2021-05-20 15:31:42 +02:00
Claudio Atzori 2e70aa43f0 Merge pull request 'H2020Classification fix and possibility to add datasources in blacklist for propagation of result to organization' (#108) from miriam.baglioni/dnet-hadoop:master into master
Reviewed-on: #108

The changes look ok, but please drop a comment to describe how the parameters should be changed from the workflow caller for both workflows
* H2020Classification
* propagation of result to organization
2021-05-20 15:25:05 +02:00
Antonis Lempesis 168edcbde3 added the final steps for the observatory promote wf and some cleanup 2021-05-18 15:23:20 +03:00
Antonis Lempesis 625d993cd9 added step for observatory db 2021-04-20 02:31:06 +03:00
Antonis Lempesis 25d0512fbd code cleanup 2021-04-20 01:43:23 +03:00
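Several commits above introduce DOI normalization across the DoiBoost mappers (Crossref, MAG, ORCID, UnpayWall). As a minimal sketch, the snippet below restates that logic in a self-contained Scala object; normalizeDoi is copied from the DoiBoostMappingUtil diff further down, while the wrapper object, the main method and the example inputs are illustrative assumptions.

object DoiNormalizationSketch {
  val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"
  val DOI_PREFIX = "10."

  def isEmpty(x: String): Boolean = x == null || x.trim.isEmpty

  // Strip whitespace, lowercase, rewrite the first "10." prefix match,
  // then drop everything before it; return null when no valid prefix is found.
  def normalizeDoi(input: String): String = {
    val replaced = input.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
    if (isEmpty(replaced)) return null
    if (replaced.indexOf("10.") < 0) return null
    val ret = replaced.substring(replaced.indexOf("10."))
    if (!ret.startsWith(DOI_PREFIX)) return null
    ret
  }

  def main(args: Array[String]): Unit = {
    println(normalizeDoi("https://doi.org/10.1234/ABC")) // 10.1234/abc
    println(normalizeDoi(" 10.5072/Test "))              // 10.5072/test
    println(normalizeDoi("hdl:1234/5678"))               // null (no "10." prefix)
  }
}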
57 changed files with 4988 additions and 559 deletions

View File

@@ -115,6 +115,8 @@ public class AuthorMerger {
}
public static String pidToComparableString(StructuredProperty pid) {
if (pid == null)
return "";
return (pid.getQualifier() != null
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
: "")

View File

@@ -7,6 +7,37 @@
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
@@ -24,12 +55,6 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>com.sun.xml.bind</groupId>
<artifactId>jaxb-core</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
@@ -37,6 +62,13 @@
<artifactId>dhp-schemas</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-graph-mapper</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>

View File

@@ -0,0 +1,544 @@
package eu.dnetlib.dhp.actionmanager.datacite
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import java.nio.charset.CodingErrorAction
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.{Date, Locale}
import java.util.regex.Pattern
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
object DataciteToOAFTransformation {
val UNKNOWN_REPOSITORY_ORIGINALID = "openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18"
val DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254"
val DNET_DATACITE_DATE = "dnet:dataCite_date"
val DNET_DATACITE_TITLE = "dnet:dataCite_title"
val SYSIMPORT_ACTIONSET = "sysimport:actionset"
val DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"
val PROVENANCE_ACTION_SET_QUALIFIER: Qualifier = OafMapperUtils.qualifier(SYSIMPORT_ACTIONSET, SYSIMPORT_ACTIONSET, DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS)
val MAIN_TITLE_QUALIFIER:Qualifier = OafMapperUtils.qualifier("main title","main title",DNET_DATACITE_TITLE,DNET_DATACITE_TITLE)
implicit val codec: Codec = Codec("UTF-8")
codec.onMalformedInput(CodingErrorAction.REPLACE)
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
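// Filter terms loaded from the "datacite_filter" resource: any record whose JSON contains one of them is discarded (see filter_json below).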
val j_filter: List[String] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
s.lines.toList
}
val mapper = new ObjectMapper()
val unknown_repository: HostedByMapType = HostedByMapType(UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
val dataInfo: DataInfo = generateDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(DATACITE_ID, "Datacite")
val hostedByMap: Map[String, HostedByMapType] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(s)
json.extract[Map[String, HostedByMapType]]
}
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
def filter_json(json: String): Boolean = {
j_filter.exists(f => json.contains(f))
}
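// Serializes an Oaf entity into an ActionSet entry: a (class name, AtomicAction JSON) pair.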
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
item match {
case dataset: OafDataset =>
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
a.setClazz(classOf[OafDataset])
a.setPayload(dataset)
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
case publication: Publication =>
val a: AtomicAction[Publication] = new AtomicAction[Publication]
a.setClazz(classOf[Publication])
a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case software: Software =>
val a: AtomicAction[Software] = new AtomicAction[Software]
a.setClazz(classOf[Software])
a.setPayload(software)
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
case orp: OtherResearchProduct =>
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
a.setClazz(classOf[OtherResearchProduct])
a.setPayload(orp)
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation])
a.setPayload(relation)
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
case _ =>
null
}
}
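// True when the given embargo end date is already in the past.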
def embargo_end(embargo_end_date: String): Boolean = {
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
val td = LocalDate.now()
td.isAfter(dt)
}
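// Extracts the first date-like substring and normalizes it to ISO yyyy-MM-dd, trying the English then the Italian patterns; a bare year becomes January 1st.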
def extract_date(input: String): Option[String] = {
val d = Date_regex.map(pattern => {
val matcher = pattern.matcher(input)
if (matcher.find())
matcher.group(0)
else
null
}
).find(s => s != null)
if (d.isDefined) {
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
try {
return Some(LocalDate.parse(a_date, df_en).toString)
} catch {
case _: Throwable => try {
return Some(LocalDate.parse(a_date, df_it).toString)
} catch {
case _: Throwable =>
return None
}
}
}
d
}
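// Reinterprets a date expressed in the Thai Buddhist calendar (e.g. 2561 BE) as the corresponding ISO date (2018-01-01); applied to DOIs with prefix 10.14457.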
def fix_thai_date(input:String, format:String) :String = {
try {
val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format))
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
LocalDate.from(d).toString
} catch {
case _: Throwable => ""
}
}
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
if (schemaOrg != null && schemaOrg.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
null
}
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (typeQualifiers == null)
return null
val i = new Instance
i.setInstancetype(typeQualifiers._1)
typeQualifiers._2.getClassname match {
case "dataset" =>
val r = new OafDataset
r.setInstance(List(i).asJava)
return r
case "publication" =>
val r = new Publication
r.setInstance(List(i).asJava)
return r
case "software" =>
val r = new Software
r.setInstance(List(i).asJava)
return r
case "other" =>
val r = new OtherResearchProduct
r.setInstance(List(i).asJava)
return r
}
null
}
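// True when the record declares at least one date of type "available".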
def available_date(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val l: List[String] = for {
JObject(dates) <- json \\ "dates"
JField("dateType", JString(dateTypes)) <- dates
} yield dateTypes
l.exists(p => p.equalsIgnoreCase("available"))
}
def OPEN_ACCESS_RIGHT = {
val result = new Qualifier
result.setClassid("OPEN")
result.setClassid("OPEN")
result.setSchemeid(ModelConstants.DNET_ACCESS_MODES)
result.setSchemename(ModelConstants.DNET_ACCESS_MODES)
result
}
/**
* As described in ticket #6377,
* when the result comes from Figshare we need to remove the subjects
* and set the access rights to OPEN.
* @param r
*/
def fix_figshare(r: Result): Unit = {
if (r.getInstance() != null) {
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(OPEN_ACCESS_RIGHT))
val l: List[StructuredProperty] = List()
r.setSubject(l.asJava)
}
}
}
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
OafMapperUtils.structuredProperty(dt, q, null)
}
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
val r = new Relation
r.setSource(sourceId)
r.setTarget(targetId)
r.setRelType(ModelConstants.RESULT_PROJECT)
r.setRelClass(relClass)
r.setSubRelType(ModelConstants.OUTCOME)
r.setCollectedfrom(List(cf).asJava)
r.setDataInfo(di)
r
}
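// Maps an EC award URI (H2020/FP7) onto the produces/isProducedBy relations towards the corresponding OpenAIRE project id.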
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())
if (match_pattern.isDefined) {
val m = match_pattern.get._1
val p = match_pattern.get._2
val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}"
List(
generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo),
generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
)
}
else
List()
}
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup): List[Oaf] = {
if (filter_json(input))
return List()
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
val doi = (json \ "attributes" \ "doi").extract[String]
if (doi.isEmpty)
return List()
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (result == null)
return List()
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
result.setPid(List(pid).asJava)
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
result.setOriginalId(List(doi).asJava)
val d = new Date(dateOfCollection * 1000)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
result.setDateofcollection(ISO8601FORMAT.format(d))
result.setDateoftransformation(ISO8601FORMAT.format(ts))
result.setDataInfo(dataInfo)
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
val authors = creators.zipWithIndex.map { case (c, idx) =>
val a = new Author
a.setFullname(c.name.orNull)
a.setName(c.givenName.orNull)
a.setSurname(c.familyName.orNull)
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
a.setPid(c.nameIdentifiers.get.map(ni => {
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
}
else
null
}
)
.asJava)
}
if (c.affiliation.isDefined)
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
a.setRank(idx + 1)
a
}
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils.structuredProperty(t.title.get, MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, DNET_DATACITE_TITLE, DNET_DATACITE_TITLE, null)
}
}).asJava)
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
return List()
result.setAuthor(authors.asJava)
val dates = (json \\ "dates").extract[List[DateType]]
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
val i_date = dates
.filter(d => d.date.isDefined && d.dateType.isDefined)
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
.map(d => extract_date(d.date.get))
val a_date: Option[String] = dates
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
.map(d => extract_date(d.date.get))
.find(d => d != null && d.isDefined)
.map(d => d.get)
if (a_date.isDefined) {
if(doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null))
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
}
else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
}
}
else if (publication_year != null) {
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
}
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d => (d._1.get, vocabularies.getTermAsQualifier(DNET_DATACITE_DATE, d._2.toLowerCase())))
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2)).asJava)
val subjects = (json \\ "subjects").extract[List[SubjectType]]
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
.map(s =>
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava)
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
result.setDescription(
descriptions
.filter(d => d.description.isDefined).
map(d =>
OafMapperUtils.field(d.description.get, null)
).filter(s => s != null).asJava)
val publisher = (json \\ "publisher").extractOrElse[String](null)
if (publisher != null)
result.setPublisher(OafMapperUtils.field(publisher, null))
val language: String = (json \\ "language").extractOrElse[String](null)
if (language != null)
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
val instance = result.getInstance().get(0)
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
val accessRights: List[String] = for {
JObject(rightsList) <- json \\ "rightsList"
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
val aRights: Option[Qualifier] = accessRights.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
}).find(q => q != null).map(q => {
val a = new Qualifier
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
a.setSchemename(q.getSchemename)
a
})
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.qualifier(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
if (client.isDefined) {
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier)
val license = accessRights
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
if (license.isDefined)
instance.setLicense(OafMapperUtils.field(license.get, null))
}
val awardUris: List[String] = for {
JObject(fundingReferences) <- json \\ "fundingReferences"
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri
val relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
}
else
List(result)
}
def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(PROVENANCE_ACTION_SET_QUALIFIER)
di
}
def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::")
val a = StringUtils.substringAfter(input, "::")
s"10|$b::${DHPUtils.md5(a)}"
}
}

View File

@@ -0,0 +1,40 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
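// Spark job that reads the Datacite OAF dataset and saves it as an ActionSet: a gzip-compressed SequenceFile of (class name, AtomicAction JSON) pairs.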
object ExportActionSetJobNode {
val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(ExportActionSetJobNode.getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING)
spark.read.load(sourcePath).as[Oaf]
.map(o =>DataciteToOAFTransformation.toActionSet(o))
.filter(o => o!= null)
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
}
}

View File

@@ -0,0 +1,43 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
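// Spark job that converts the native Datacite store into OAF entities, resolving vocabularies via the IS lookup service.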
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
import spark.implicits._
spark.read.load(sourcePath).as[DataciteType]
.filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies))
.filter(d => d != null)
.write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@@ -18,7 +18,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
@@ -33,7 +32,6 @@ public class PrepareProjects {
private static final Logger log = LoggerFactory.getLogger(PrepareProjects.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils

View File

@@ -0,0 +1,28 @@
TUBYDI - Assistir Filmes e Series Online Grátis
123Movies
WATCH FULL MOVIE
Movierulz
Full Movie Online
MOVIé WatcH
The King of Staten Island 2020 Online For Free
Watch Train to Busan 2 2020 online for free
Sixth Sense Movie Novelization
Film Complet streaming vf gratuit en ligne
watch now free
LIVE stream watch
LIVE stream UFC
RBC Heritage live stream
MLBStreams Free
NFL Live Stream
Live Stream Free
Royal Ascot 2020 Live Stream
TV Shows Full Episodes Official
FuboTV
Gomovies
Online Free Trial Access
123watch
DÜŞÜK HAPI
Bebek Düşürme Yöntemleri
WHATSAP İLETİŞİM
Cytotec
düşük hapı

View File

@@ -0,0 +1,21 @@
[
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source mdstore path",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the target mdstore path",
"paramRequired": true
},
{
"paramName": "m",
"paramLongName": "master",
"paramDescription": "the master name",
"paramRequired": true
}
]

View File

@@ -0,0 +1,26 @@
[
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source mdstore path",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the target mdstore path",
"paramRequired": true
},
{
"paramName": "m",
"paramLongName": "master",
"paramDescription": "the master name",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "isLookupUrl",
"paramDescription": "the isLookup URL",
"paramRequired": true
}
]

View File

@@ -0,0 +1,23 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@@ -0,0 +1,46 @@
<workflow-app name="Import_Datacite_and_transform_to_OAF" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>mainPath</name>
<description>the working path of Datacite stores</description>
</property>
<property>
<name>isLookupUrl</name>
<description>The IS lookUp service endpoint</description>
</property>
</parameters>
<start to="TransformJob"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="TransformJob">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>TransformJob</name>
<class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
<arg>--targetPath</arg><arg>${mainPath}/production/datacite_oaf</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@@ -0,0 +1,23 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@@ -0,0 +1,46 @@
<workflow-app name="Datacite_to_ActionSet_Workflow" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the working path of Datacite stores</description>
</property>
<property>
<name>outputPath</name>
<description>the path of Datacite ActionSet</description>
</property>
</parameters>
<start to="ExportDataset"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ExportDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExportDataset</name>
<class>eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${outputPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@@ -0,0 +1,50 @@
package eu.dentlib.dhp.aggregation;
import static org.mockito.Mockito.lenient;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import org.apache.commons.io.IOUtils;
import org.mockito.Mock;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
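// Base class for tests needing vocabularies: mocks the IS lookup service and loads terms/synonyms from the test resources.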
public abstract class AbstractVocabularyTest {
@Mock
protected ISLookUpService isLookUpService;
protected VocabularyGroup vocabularies;
public void setUpVocabulary() throws ISLookUpException, IOException {
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
lenient()
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
.thenReturn(synonyms());
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
}
private static List<String> vocs() throws IOException {
return IOUtils
.readLines(
Objects
.requireNonNull(
AbstractVocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/vocabulary/terms.txt")));
}
private static List<String> synonyms() throws IOException {
return IOUtils
.readLines(
Objects
.requireNonNull(
AbstractVocabularyTest.class.getResourceAsStream("/eu/dnetlib/dhp/vocabulary/synonyms.txt")));
}
}

View File

@@ -0,0 +1,43 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dentlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import org.codehaus.jackson.map.ObjectMapper
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
class DataciteToOAFTest extends AbstractVocabularyTest{
@BeforeEach
def setUp() :Unit = {
super.setUpVocabulary()
}
@Test
def testMapping() :Unit = {
val record =Source.fromInputStream(getClass.getResourceAsStream("datacite.json")).mkString
val mapper = new ObjectMapper()
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies )
println (mapper.defaultPrettyPrintingWriter().writeValueAsString(res.head))
}
@Test
def testDate():Unit = {
println(DataciteToOAFTransformation.fix_thai_date("01-01-2561","[dd-MM-yyyy]"))
println(DataciteToOAFTransformation.fix_thai_date("2561-01-01","[yyyy-MM-dd]"))
}
}

File diff suppressed because one or more lines are too long

View File

@@ -13,6 +13,7 @@ import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.TypedColumn;
@@ -24,6 +25,7 @@ import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.DatasourceStats;
import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.StatsAggregator;
import scala.Tuple2;
public class GenerateStatsJob {
@@ -71,9 +73,14 @@ public class GenerateStatsJob {
ClusterUtils
.readPath(spark, eventsPath, Event.class)
.groupByKey(e -> e.getTopic() + "@@@" + e.getMap().getTargetDatasourceId(), Encoders.STRING())
.groupByKey(
(MapFunction<Event, String>) e -> e.getTopic() + "@@@" + e.getMap().getTargetDatasourceId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(DatasourceStats.class))
.map(
(MapFunction<Tuple2<String, DatasourceStats>, DatasourceStats>) t -> t._2,
Encoders.bean(DatasourceStats.class))
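// coalesce to a single partition so the JDBC write below opens a single connection towards Postgres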
.coalesce(1)
.write()
.mode(SaveMode.Overwrite)
.jdbc(dbUrl, "oa_datasource_stats_temp", connectionProperties);

View File

@@ -26,6 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
@@ -144,7 +145,7 @@ public class ConversionUtils {
.filter(pid -> pid != null)
.filter(pid -> pid.getQualifier() != null)
.filter(pid -> pid.getQualifier().getClassid() != null)
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID))
.map(pid -> pid.getValue())
.map(pid -> cleanOrcid(pid))
.filter(StringUtils::isNotBlank)

View File

@@ -18,7 +18,7 @@ import eu.dnetlib.dhp.schema.oaf.Field;
public class DatePicker {
private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";
private static final String DATE_PATTERN = "^(\\d{4})-(\\d{2})-(\\d{2})";
private static final String DATE_DEFAULT_SUFFIX = "01-01";
private static final int YEAR_LB = 1300;
private static final int YEAR_UB = Year.now().getValue() + 5;
@@ -28,6 +28,7 @@ public class DatePicker {
final Map<String, Integer> frequencies = dateofacceptance
.parallelStream()
.filter(StringUtils::isNotBlank)
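// normalize ISO date-times to plain dates (e.g. 2016-06-16T12:00:00Z -> 2016-06-16) before counting frequencies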
.map(d -> substringBefore(d, "T"))
.collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum));
if (frequencies.isEmpty()) {

View File

@@ -0,0 +1,44 @@
package eu.dnetlib.dhp.oa.dedup;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Collection;
import org.junit.jupiter.api.Test;
import com.clearspring.analytics.util.Lists;
public class DatePickerTest {
Collection<String> dates = Lists.newArrayList();
@Test
public void testPickISO() {
dates.add("2016-01-01T12:00:00Z");
dates.add("2016-06-16T12:00:00Z");
dates.add("2020-01-01T12:00:00Z");
dates.add("2020-10-01T12:00:00Z");
assertEquals("2020-10-01", DatePicker.pick(dates).getValue());
}
@Test
public void testPickSimple() {
dates.add("2016-01-01");
dates.add("2016-06-16");
dates.add("2020-01-01");
dates.add("2020-10-01");
assertEquals("2020-10-01", DatePicker.pick(dates).getValue());
}
@Test
public void testPickFrequent() {
dates.add("2016-02-01");
dates.add("2016-02-01");
dates.add("2016-02-01");
dates.add("2020-10-01");
assertEquals("2016-02-01", DatePicker.pick(dates).getValue());
}
}

View File

@@ -5,6 +5,7 @@ import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue,
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.ModelConstants
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
@@ -28,7 +29,6 @@ object DoiBoostMappingUtil {
//STATIC STRING
val MAG = "microsoft"
val MAG_NAME = "Microsoft Academic Graph"
val ORCID = "orcid"
val ORCID_PENDING = "orcid_pending"
val CROSSREF = "Crossref"
val UNPAYWALL = "UnpayWall"
@@ -37,8 +37,9 @@
val doiBoostNSPREFIX = "doiboost____"
val OPENAIRE_PREFIX = "openaire____"
val SEPARATOR = "::"
val DNET_LANGUAGES = "dnet:languages"
val PID_TYPES = "dnet:pid_types"
val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)";
val DOI_PREFIX = "10.";
val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")
@@ -326,8 +327,8 @@ object DoiBoostMappingUtil {
def createORIDCollectedFrom(): KeyValue = {
val cf = new KeyValue
cf.setValue(ORCID)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ORCID.toLowerCase))
cf.setValue(ModelConstants.ORCID_DS)
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(ModelConstants.ORCID))
cf
}
@@ -373,4 +374,25 @@
}
def isEmpty(x: String) = x == null || x.trim.isEmpty
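// Normalizes a DOI: strips whitespace, lowercases, rewrites the first "10." prefix match and drops everything before it; returns null when no valid "10." prefix remains.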
def normalizeDoi(input : String) :String ={
val replaced = input.replaceAll("(?:\\n|\\r|\\t|\\s)", "").toLowerCase.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
if (isEmpty(replaced))
return null
if(replaced.indexOf("10.") < 0)
return null
val ret = replaced.substring(replaced.indexOf("10."))
if (!ret.startsWith(DOI_PREFIX))
return null
return ret
}
}

View File

@@ -15,6 +15,7 @@ import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils
import eu.dnetlib.doiboost.DoiBoostMappingUtil
case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
@@ -86,8 +87,8 @@ case object Crossref2Oaf {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
//MAPPING Crossref DOI into PID
val doi: String = (json \ "DOI").extract[String]
result.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
val doi: String = DoiBoostMappingUtil.normalizeDoi((json \ "DOI").extract[String])
result.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
//MAPPING Crossref DOI into OriginalId
//and Other Original Identifier of dataset like clinical-trial-number

View File

@@ -1,6 +1,7 @@
package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.SparkConf
@@ -22,7 +23,7 @@ object CrossrefDataset {
lazy val json: json4s.JValue = parse(input)
val ts:Long = (json \ "indexed" \ "timestamp").extract[Long]
val doi:String = (json \ "DOI").extract[String]
CrossrefDT(doi, input, ts)
CrossrefDT(DoiBoostMappingUtil.normalizeDoi(doi), input, ts)
}

View File

@@ -33,9 +33,9 @@ object SparkMapDumpIntoOAF {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderRelatons: Encoder[Relation] = Encoders.kryo[Relation]
implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset]
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
val targetPath = parser.get("targetPath")
import spark.implicits._
spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
.flatMap(k => Crossref2Oaf.convert(k.json))

View File

@@ -188,11 +188,11 @@ case object ConversionUtil {
val authors = inputParams._2
val pub = new Publication
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
pub.setPid(List(createSP(paper.Doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi).asJava)
//Set identifier as 50|doiboost____::md5(DOI)
pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase))
pub.setId(generateIdentifier(pub, paper.Doi))
val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title")
val originalTitles = createSP(paper.OriginalTitle, "alternative title", "dnet:dataCite_title")
@@ -247,11 +247,11 @@
val description = inputParams._2
val pub = new Publication
pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)
pub.setPid(List(createSP(paper.Doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setOriginalId(List(paper.PaperId.toString, paper.Doi).asJava)
//Set identifier as 50 | doiboost____::md5(DOI)
pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase))
pub.setId(generateIdentifier(pub, paper.Doi))
val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title")
val originalTitles = createSP(paper.OriginalTitle, "alternative title", "dnet:dataCite_title")

View File

@@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.mag
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
@@ -12,6 +13,23 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkProcessMAG {
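// For each normalized DOI keep only the latest MAG paper, and store the DOI itself in normalized form.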
def getDistinctResults (d:Dataset[MagPapers]):Dataset[MagPapers]={
d.where(col("Doi").isNotNull)
.groupByKey(mp => DoiBoostMappingUtil.normalizeDoi(mp.Doi))(Encoders.STRING)
.reduceGroups((p1:MagPapers,p2:MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1,p2))
.map(_._2)(Encoders.product[MagPapers])
.map(mp => {
new MagPapers(mp.PaperId, mp.Rank, DoiBoostMappingUtil.normalizeDoi(mp.Doi),
mp.DocType, mp.PaperTitle, mp.OriginalTitle,
mp.BookTitle, mp.Year, mp.Date, mp.Publisher: String,
mp.JournalId, mp.ConferenceSeriesId, mp.ConferenceInstanceId,
mp.Volume, mp.Issue, mp.FirstPage, mp.LastPage,
mp.ReferenceCount, mp.CitationCount, mp.EstimatedCitation,
mp.OriginalVenue, mp.FamilyId, mp.CreatedDate)
})(Encoders.product[MagPapers])
}
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
@@ -33,17 +51,11 @@ object SparkProcessMAG {
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
logger.info("Phase 1) make uninque DOI in Papers:")
logger.info("Phase 1) make unique DOI in Papers:")
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
val result: RDD[MagPapers] = d.where(col("Doi").isNotNull)
.rdd
.map{ p: MagPapers => Tuple2(p.Doi, p) }
.reduceByKey((p1:MagPapers,p2:MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1,p2))
.map(_._2)
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
val distinctPaper: Dataset[MagPapers] = getDistinctResults(d)
distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct")

View File

@@ -1,5 +1,7 @@ package eu.dnetlib.doiboost.orcid
package eu.dnetlib.doiboost.orcid
import java.util.stream.Collectors
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
@@ -82,8 +84,9 @@ object ORCIDToOAF {
JObject(extIds) <- json \ "workDetail" \"extIds"
JField("type", JString(typeValue)) <- extIds
JField("value", JString(value)) <- extIds
if "doi".equalsIgnoreCase(typeValue)
} yield (typeValue, value)
normalized_value: String = DoiBoostMappingUtil.normalizeDoi(value)
if "doi".equalsIgnoreCase(typeValue) && normalized_value != null
} yield (typeValue, normalized_value)
if (doi.nonEmpty) {
return doi.map(l =>OrcidWork(oid, l._2))
}

View File

@@ -30,7 +30,6 @@ public class PublicationToOaf implements Serializable {
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
public static final String ORCID = "ORCID";
public final static String orcidPREFIX = "orcid_______";
public static final String OPENAIRE_PREFIX = "openaire____";
public static final String SEPARATOR = "::";
@@ -69,7 +68,9 @@
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
{
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
put(
ModelConstants.ORCID,
new Pair<>(ModelConstants.ORCID_DS, OPENAIRE_PREFIX + SEPARATOR + ModelConstants.ORCID));
}
};
@@ -102,8 +103,6 @@
}
}
public static final String PID_TYPES = "dnet:pid_types";
public Oaf generatePublicationActionsFromJson(final String json) {
try {
if (parsedPublications != null) {
@@ -138,8 +137,8 @@
mapQualifier(
"sysimport:actionset:orcidworks-no-doi",
"sysimport:actionset:orcidworks-no-doi",
"dnet:provenanceActions",
"dnet:provenanceActions"));
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS));
publication.setDataInfo(dataInfo);
publication.setLastupdatetimestamp(new Date().getTime());
@@ -159,7 +158,9 @@
publication
.getExternalReference()
.add(
convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
convertExtRef(
extId, classid, classname, ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES));
}
});
@@ -505,24 +506,21 @@
private KeyValue createCollectedFrom() {
KeyValue cf = new KeyValue();
cf.setValue(ORCID);
cf.setValue(ModelConstants.ORCID_DS);
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
return cf;
}
private KeyValue createHostedBy() {
KeyValue hb = new KeyValue();
hb.setValue("Unknown Repository");
hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
return hb;
return ModelConstants.UNKNOWN_REPOSITORY;
}
private StructuredProperty mapAuthorId(String orcidId) {
final StructuredProperty sp = new StructuredProperty();
sp.setValue(orcidId);
final Qualifier q = new Qualifier();
q.setClassid(ORCID.toLowerCase());
q.setClassname(ORCID.toLowerCase());
q.setClassid(ModelConstants.ORCID);
q.setClassname(ModelConstants.ORCID_CLASSNAME);
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
q.setSchemename(ModelConstants.DNET_PID_TYPES);
sp.setQualifier(q);
@@ -535,8 +533,8 @@
mapQualifier(
"sysimport:crosswalk:entityregistry",
"Harvested",
"dnet:provenanceActions",
"dnet:provenanceActions"));
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS));
sp.setDataInfo(dataInfo);
return sp;
}

View File

@@ -1,6 +1,8 @@
package eu.dnetlib.doiboost.uw
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Instance, Publication}
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
@@ -26,13 +28,16 @@ object UnpayWallToOAF {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
val doi = (json \"doi").extract[String]
val doi = DoiBoostMappingUtil.normalizeDoi((json \"doi").extract[String])
//val normalized_doi = DoiBoostMappingUtil.normalizeDoi(doi)
if(doi == null)
return null
val is_oa = (json\ "is_oa").extract[Boolean]
val oaLocation:OALocation = (json \ "best_oa_location").extractOrElse[OALocation](null)
pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
pub.setPid(List(createSP(doi, "doi", ModelConstants.DNET_PID_TYPES)).asJava)
pub.setId(generateIdentifier(pub, doi.toLowerCase))
pub.setCollectedfrom(List(createUnpayWallCollectedFrom()).asJava)

View File

@@ -1,42 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
</configuration>

View File

@@ -1,372 +0,0 @@
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorIntersectionMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<!-- Intersection Parameters -->
<property>
<name>workingPath</name>
<description>the working Path</description>
</property>
<property>
<name>hostedByMapPath</name>
<description>the hostedByMap Path</description>
</property>
<property>
<name>outputPath</name>
<description>the Path of the sequence file action set</description>
</property>
<!-- Crossref Parameters -->
<property>
<name>inputPathCrossref</name>
<description>the Crossref input path</description>
</property>
<property>
<name>crossrefTimestamp</name>
<description>Timestamp for the Crossref incremental Harvesting</description>
</property>
<property>
<name>esServer</name>
<description>elasticsearch server url for the Crossref Harvesting</description>
</property>
<property>
<name>esIndex</name>
<description>elasticsearch index name for the Crossref Harvesting</description>
</property>
<!-- MAG Parameters -->
<property>
<name>MAGDumpPath</name>
<description>the MAG dump working path</description>
</property>
<property>
<name>inputPathMAG</name>
<description>the MAG working path</description>
</property>
<!-- UnpayWall Parameters -->
<property>
<name>inputPathUnpayWall</name>
<description>the UnpayWall working path</description>
</property>
<!-- ORCID Parameters -->
<property>
<name>inputPathOrcid</name>
<description>the ORCID input path</description>
</property>
<property>
<name>workingPathOrcid</name>
<description>the ORCID working path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="ConvertCrossrefToOAF">${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'}</case>
<case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
<case to="ProcessMAG">${wf:conf('resumeFrom') eq 'PreprocessMag'}</case>
<case to="ProcessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
<case to="ProcessORCID">${wf:conf('resumeFrom') eq 'PreprocessORCID'}</case>
<case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
<case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
<default to="ImportCrossRef"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ImportCrossRef">
<java>
<main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
<arg>--targetPath</arg><arg>${inputPathCrossref}/index_update</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--esServer</arg><arg>${esServer}</arg>
<arg>--esIndex</arg><arg>${esIndex}</arg>
<arg>--timestamp</arg><arg>${crossrefTimestamp}</arg>
</java>
<ok to="GenerateCrossrefDataset"/>
<error to="Kill"/>
</action>
<!-- CROSSREF SECTION -->
<action name="GenerateCrossrefDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenerateCrossrefDataset</name>
<class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--workingPath</arg><arg>${inputPathCrossref}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="RenameDataset"/>
<error to="Kill"/>
</action>
<action name="RenameDataset">
<fs>
<delete path="${inputPathCrossref}/crossref_ds"/>
<move source="${inputPathCrossref}/crossref_ds_updated"
target="${inputPathCrossref}/crossref_ds"/>
</fs>
<ok to="ResetMagWorkingPath"/>
<error to="Kill"/>
</action>
<!-- MAG SECTION -->
<action name="ResetMagWorkingPath">
<fs>
<delete path="${inputPathMAG}/dataset"/>
<delete path="${inputPathMAG}/process"/>
</fs>
<ok to="ConvertMagToDataset"/>
<error to="Kill"/>
</action>
<action name="ConvertMagToDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${MAGDumpPath}</arg>
<arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ConvertCrossrefToOAF"/>
<error to="Kill"/>
</action>
<action name="ConvertCrossrefToOAF">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ConvertCrossrefToOAF</name>
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ProcessMAG"/>
<error to="Kill"/>
</action>
<action name="ProcessMAG">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to OAF Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkProcessMAG</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorIntersectionMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg>
<arg>--workingPath</arg><arg>${inputPathMAG}/process</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ProcessUW"/>
<error to="Kill"/>
</action>
<!-- UnpayWall SECTION -->
<action name="ProcessUW">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert UnpayWall to Dataset</name>
<class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathUnpayWall}/uw_extracted</arg>
<arg>--targetPath</arg><arg>${workingPath}/uwPublication</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ProcessORCID"/>
<error to="Kill"/>
</action>
<!-- ORCID SECTION -->
<action name="ProcessORCID">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert ORCID to Dataset</name>
<class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
<arg>--workingPath</arg><arg>${workingPathOrcid}</arg>
<arg>--targetPath</arg><arg>${workingPath}/orcidPublication</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="CreateDOIBoost"/>
<error to="Kill"/>
</action>
<!-- INTERSECTION SECTION-->
<action name="CreateDOIBoost">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create DOIBoost Infospace</name>
<class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorIntersectionMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
<arg>--affiliationPath</arg><arg>${inputPathMAG}/dataset/Affiliations</arg>
<arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/dataset/PaperAuthorAffiliations</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="GenerateActionSet"/>
<error to="Kill"/>
</action>
<action name="GenerateActionSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Generate DOIBoost ActionSet</name>
<class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>
<arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
<arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
<arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
<arg>--dbOrganizationPath</arg><arg>${workingPath}/doiBoostOrganization</arg>
<arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
<arg>--sFilePath</arg><arg>${outputPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>


@@ -0,0 +1,46 @@
package eu.dnetlib.dhp.doiboost
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import org.junit.jupiter.api.Test
class NormalizeDOITest {
@Test
def doiDSLowerCase():Unit = {
val doi ="10.1042/BCJ20160876"
assert(DoiBoostMappingUtil.normalizeDoi(doi).equals(doi.toLowerCase()))
}
@Test
def doiFiltered():Unit = {
val doi = "0.1042/BCJ20160876"
assert(DoiBoostMappingUtil.normalizeDoi(doi) == null)
}
@Test
def doiFiltered2():Unit = {
val doi = "https://doi.org/0.1042/BCJ20160876"
assert(DoiBoostMappingUtil.normalizeDoi(doi) == null)
}
@Test
def doiCleaned():Unit = {
val doi = "https://doi.org/10.1042/BCJ20160876"
assert(DoiBoostMappingUtil.normalizeDoi(doi).equals("10.1042/BCJ20160876".toLowerCase()))
}
@Test
def doiCleaned1():Unit = {
val doi = "https://doi.org/10.1042/ BCJ20160876"
assert(DoiBoostMappingUtil.normalizeDoi(doi).equals("10.1042/BCJ20160876".toLowerCase()))
}
}
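Read together, the five cases pin down the contract of the utility under test: lower-case the value, strip whitespace, drop any resolver prefix before the first "10.", and return null when no "10." prefix can be found. A minimal sketch that satisfies all five assertions, assuming nothing about the shipped DoiBoostMappingUtil.normalizeDoi beyond these tests:

// Sketch only: the real implementation may differ in its internals.
def normalizeDoi(input: String): String = {
  if (input == null) return null
  val doi = input.replaceAll("\\s", "").toLowerCase
  val start = doi.indexOf("10.")
  if (start < 0) null else doi.substring(start)
}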


@@ -367,6 +367,7 @@ class CrossrefMappingTest {
val relevantDates = result.getRelevantdate.asScala
assert(relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")), "Missing relevant date of type created")
val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
@@ -384,4 +385,36 @@ class CrossrefMappingTest {
}
@Test
def testNormalizeDOI(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString
val line :String = "\"funder\": [{\"name\": \"Wellcome Trust Masters Fellowship\",\"award\": [\"090633\"]}],"
val json = template.replace("%s", line)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty)
val items = resultList.filter(p => p.isInstanceOf[Publication])
val result: Result = items.head.asInstanceOf[Publication]
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
assertTrue(result.getPid.size() == 1)
result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())))
}
@Test
def testNormalizeDOI2(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("article.json")).mkString
val resultList: List[Oaf] = Crossref2Oaf.convert(template)
assertTrue(resultList.nonEmpty)
val items = resultList.filter(p => p.isInstanceOf[Publication])
val result: Result = items.head.asInstanceOf[Publication]
result.getPid.asScala.foreach(pid => assertTrue(pid.getQualifier.getClassid.equals("doi")))
assertTrue(result.getPid.size() == 1)
result.getPid.asScala.foreach(pid => assertTrue(pid.getValue.equals("10.26850/1678-4618EQJ.v35.1.2010.p41-46".toLowerCase())))
}
}
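Both tests expect the same canonical pid value; the two fixture tweaks shown later (a leading space in one resource, upper-case "EQJ" in the other) exist precisely to exercise that. A hedged usage sketch of the invariant the tests rely on:

import eu.dnetlib.doiboost.DoiBoostMappingUtil

// Both fixture variants should collapse to one canonical, lower-case pid value.
assert(DoiBoostMappingUtil.normalizeDoi(" 10.26850/1678-4618EQJ.v35.1.2010.p41-46")
  .equals("10.26850/1678-4618eqj.v35.1.2010.p41-46"))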


@@ -4,7 +4,7 @@ import java.sql.Timestamp
import eu.dnetlib.dhp.schema.oaf.Publication
import org.apache.htrace.fasterxml.jackson.databind.SerializationFeature
-import org.apache.spark.SparkConf
+import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
@@ -63,6 +63,62 @@ class MAGMappingTest {
}
@Test
def normalizeDoiTest():Unit = {
import org.json4s.jackson.Serialization.write
import org.json4s.DefaultFormats
implicit val formats = DefaultFormats
// val serializedJSON:String = write(myObject)
val conf = new SparkConf().setAppName("test").setMaster("local[2]")
val sc = new SparkContext(conf)
val spark = SparkSession.builder.config(sc.getConf).getOrCreate()
val path = getClass.getResource("magPapers.json").getPath
import org.apache.spark.sql.Encoders
val schema = Encoders.product[MagPapers].schema
import spark.implicits._
val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
assertTrue(ret.count == 10)
ret.take(10).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
spark.close()
}
@Test
def normalizeDoiTest2():Unit = {
import org.json4s.jackson.Serialization.write
import org.json4s.DefaultFormats
implicit val formats = DefaultFormats
// val serializedJSON:String = write(myObject)
val conf = new SparkConf().setAppName("test").setMaster("local[2]")
val sc = new SparkContext(conf)
val spark = SparkSession.builder.config(sc.getConf).getOrCreate()
val path = getClass.getResource("duplicatedMagPapers.json").getPath
import org.apache.spark.sql.Encoders
val schema = Encoders.product[MagPapers].schema
import spark.implicits._
val magPapers :Dataset[MagPapers] = spark.read.option("multiline",true).schema(schema).json(path).as[MagPapers]
val ret :Dataset[MagPapers] = SparkProcessMAG.getDistinctResults(magPapers)
assertTrue(ret.count == 8)
ret.take(8).foreach(mp => assertTrue(mp.Doi.equals(mp.Doi.toLowerCase())))
spark.close()
//ret.take(8).foreach(mp => println(write(mp)))
}
}
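The two magPapers fixtures shown later differ only in whether several records normalize to the same DOI, so together these tests constrain getDistinctResults to: normalize every Doi, discard records whose DOI cannot be normalized, and keep one row per normalized DOI. A minimal sketch that satisfies both expected counts, assuming MagPapers is a case class with a Doi field; the shipped SparkProcessMAG.getDistinctResults may be implemented differently:

import org.apache.spark.sql.Dataset
import eu.dnetlib.doiboost.DoiBoostMappingUtil

// Sketch only: normalize, drop unresolvable DOIs, deduplicate on the DOI value.
def getDistinctResults(d: Dataset[MagPapers]): Dataset[MagPapers] = {
  import d.sparkSession.implicits._
  d.map(mp => mp.copy(Doi = DoiBoostMappingUtil.normalizeDoi(mp.Doi)))
    .filter(mp => mp.Doi != null)
    .dropDuplicates("Doi")
}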


@@ -7,7 +7,8 @@ import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory}
import java.io._
import scala.collection.JavaConversions._
import scala.io.Source
class MappingORCIDToOAFTest {
@@ -46,6 +47,25 @@ class MappingORCIDToOAFTest {
//
// }
@Test
def testExtractDat1():Unit ={
val aList: List[OrcidAuthor] = List(OrcidAuthor("0000-0002-4335-5309", Some("Lucrecia"), Some("Curto"), null, null, null ),
OrcidAuthor("0000-0001-7501-3330", Some("Emilio"), Some("Malchiodi"), null, null, null ), OrcidAuthor("0000-0002-5490-9186", Some("Sofia"), Some("Noli Truant"), null, null, null ))
val orcid:ORCIDItem = ORCIDItem("10.1042/BCJ20160876", aList)
val oaf = ORCIDToOAF.convertTOOAF(orcid)
assert(oaf.getPid.size() == 1)
oaf.getPid.toList.foreach(pid => assert(pid.getQualifier.getClassid.equals("doi")))
oaf.getPid.toList.foreach(pid => assert(pid.getValue.equals("10.1042/BCJ20160876".toLowerCase())))
println(mapper.writeValueAsString(ORCIDToOAF.convertTOOAF(orcid)))
}
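For orientation, the shape of the inputs this test constructs, inferred purely from the constructor calls above; the real case classes may carry different or additional members:

// Sketch only: field names are guesses from the call sites, not from the source.
case class OrcidAuthor(oid: String, name: Option[String], surname: Option[String],
                       creditName: Option[String], otherNames: Option[List[String]],
                       errorCode: Option[String])
case class ORCIDItem(doi: String, authors: List[OrcidAuthor])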

File diff suppressed because one or more lines are too long


@@ -1,5 +1,5 @@
{
"DOI": "10.26850/1678-4618eqj.v35.1.2010.p41-46",
"DOI": " 10.26850/1678-4618eqj.v35.1.2010.p41-46",
"issued": {
"date-parts": [
[


@@ -1,5 +1,5 @@
{
"DOI": "10.26850/1678-4618eqj.v35.1.2010.p41-46",
"DOI": "10.26850/1678-4618EQJ.v35.1.2010.p41-46",
"issued": {
"date-parts": [
[


@@ -0,0 +1,10 @@
[{"PaperId":2866429360,"Rank":1,"Doi":"10.5465/AMBPP.2018.12619SYMPOSIUM","DocType":null,"PaperTitle":"new directions in research on conflict dynamics","OriginalTitle":"New Directions in Research on Conflict Dynamics","BookTitle":null,"Year":2018,"Date":"2018-07-09T00:00:00Z","Publisher":"Academy of Management Briarcliff Manor, NY 10510","JournalId":null,"Volume":"2018","Issue":"1","FirstPage":"12619","LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Academy of Management Proceedings","CreatedDate":"2018-07-19T00:00:00Z"},
{"PaperId":2871494677,"Rank":2,"Doi":"10.1007/978-981-10-8971-8_33","DocType":null,"PaperTitle":"wild flame detection using weight adaptive particle filter from monocular video","OriginalTitle":"Wild Flame Detection Using Weight Adaptive Particle Filter from Monocular Video","BookTitle":null,"Year":2019,"Date":"2019-01-01T00:00:00Z","Publisher":"Springer, Singapore","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"357","LastPage":"365","ReferenceCount":14,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-07-19T00:00:00Z"},
{"PaperId":2883520096,"Rank":3,"Doi":"10.5465/AMBPP .2018.12619SYMPOSIUM","DocType":"Journal","PaperTitle":"elaboracion de un corpus cacografico desde la disponibilidad lexica en estudiantes sevillanos un analisis para la ensenanza de la lengua","OriginalTitle":"Elaboración de un corpus cacográfico desde la disponibilidad léxica en estudiantes sevillanos. Un análisis para la enseñanza de la lengua","BookTitle":null,"Year":2018,"Date":"2018-07-13T00:00:00Z","Publisher":"Poli papers","JournalId":2738339871,"Volume":"13","Issue":"1","FirstPage":"119","LastPage":"131","ReferenceCount":28,"CitationCount":2,"EstimatedCitation":2,"OriginalVenue":"Revista de Lingüística y Lenguas Aplicadas","CreatedDate":"2018-08-03T00:00:00Z"},
{"PaperId":2883800636,"Rank":4,"Doi":"10.1007/978-3-319-92513-4_4","DocType":null,"PaperTitle":"cognitive advantage of bilingualism and its criticisms","OriginalTitle":"Cognitive Advantage of Bilingualism and Its Criticisms","BookTitle":null,"Year":2018,"Date":"2018-01-01T00:00:00Z","Publisher":"Springer, Cham","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"67","LastPage":"89","ReferenceCount":74,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-08-03T00:00:00Z"},
{"PaperId":2885023064,"Rank":5,"Doi":"10.1097/NNA.0000000000000647","DocType":"Journal","PaperTitle":"enhancing and advancing shared governance through a targeted decision making redesign","OriginalTitle":"Enhancing and Advancing Shared Governance Through a Targeted Decision-Making Redesign.","BookTitle":null,"Year":2018,"Date":"2018-09-01T00:00:00Z","Publisher":"J Nurs Adm","JournalId":194945867,"Volume":"48","Issue":"9","FirstPage":"445","LastPage":"451","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Journal of Nursing Administration","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2885607541,"Rank":1,"Doi":"10.1007/S10465-018-9283-7","DocType":"Journal","PaperTitle":"dance movement therapists attitudes and actions regarding lgbtqi and gender nonconforming communities","OriginalTitle":"Dance/Movement Therapists Attitudes and Actions Regarding LGBTQI and Gender Nonconforming Communities","BookTitle":null,"Year":2018,"Date":"2018-08-07T00:00:00Z","Publisher":"Springer US","JournalId":104993962,"Volume":"40","Issue":"2","FirstPage":"202","LastPage":"223","ReferenceCount":40,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"American Journal of Dance Therapy","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2886182429,"Rank":2,"Doi":"10.13039/501100003329","DocType":null,"PaperTitle":"caracteres de adaptacion en judia comun phaseolus vulgaris l aproximacion genetica e identificacion de qtls","OriginalTitle":"Caracteres de adaptación en judía común (Phaseolus vulgaris L.): aproximación genética e identificación de QTLs","BookTitle":null,"Year":2017,"Date":"2017-06-15T00:00:00Z","Publisher":"CSIC - Misión Biológica de Galicia (MBG)","JournalId":null,"Volume":null,"Issue":null,"FirstPage":null,"LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":null,"CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2887149460,"Rank":3,"Doi":"10.1093/FEMSLE/FNY192","DocType":"Journal","PaperTitle":"small extracellular particles with big potential for horizontal gene transfer membrane vesicles and gene transfer agents","OriginalTitle":"Small extracellular particles with big potential for horizontal gene transfer: membrane vesicles and gene transfer agents.","BookTitle":null,"Year":2018,"Date":"2018-10-01T00:00:00Z","Publisher":"Narnia","JournalId":34954451,"Volume":"365","Issue":"19","FirstPage":null,"LastPage":null,"ReferenceCount":124,"CitationCount":13,"EstimatedCitation":13,"OriginalVenue":"Fems Microbiology Letters","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2887446149,"Rank":4,"Doi":"10.5465/ambpp.2018.12619symposium","DocType":"Journal","PaperTitle":"notes from the field toxigenic vibrio cholerae o141 in a traveler to florida nebraska 2017","OriginalTitle":"Notes from the Field: Toxigenic Vibrio cholerae O141 in a Traveler to Florida — Nebraska, 2017","BookTitle":null,"Year":2018,"Date":"2018-08-03T00:00:00Z","Publisher":"Centers for Disease Control MMWR Office","JournalId":183158886,"Volume":"67","Issue":"30","FirstPage":"838","LastPage":"839","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Morbidity and Mortality Weekly Report","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2889180499,"Rank":5,"Doi":"10.1007/S10924-018-1299-Z","DocType":"Journal","PaperTitle":"hybrid adsorbent materials obtained by the combination of poly ethylene alt maleic anhydride with lignin and lignosulfonate","OriginalTitle":"Hybrid Adsorbent Materials Obtained by the Combination of Poly(ethylene-alt-maleic anhydride) with Lignin and Lignosulfonate","BookTitle":null,"Year":2018,"Date":"2018-08-30T00:00:00Z","Publisher":"Springer US","JournalId":193665811,"Volume":"26","Issue":"11","FirstPage":"4293","LastPage":"4302","ReferenceCount":29,"CitationCount":5,"EstimatedCitation":5,"OriginalVenue":"Journal of Polymers and The Environment","CreatedDate":"2018-09-07T00:00:00Z"}]


@@ -0,0 +1,10 @@
[{"PaperId":2866429360,"Rank":1,"Doi":"10.5465/AMBPP.2018.12619SYMPOSIUM","DocType":null,"PaperTitle":"new directions in research on conflict dynamics","OriginalTitle":"New Directions in Research on Conflict Dynamics","BookTitle":null,"Year":2018,"Date":"2018-07-09T00:00:00Z","Publisher":"Academy of Management Briarcliff Manor, NY 10510","JournalId":null,"Volume":"2018","Issue":"1","FirstPage":"12619","LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Academy of Management Proceedings","CreatedDate":"2018-07-19T00:00:00Z"},
{"PaperId":2871494677,"Rank":2,"Doi":"10.1007/978-981-10-8971-8_33","DocType":null,"PaperTitle":"wild flame detection using weight adaptive particle filter from monocular video","OriginalTitle":"Wild Flame Detection Using Weight Adaptive Particle Filter from Monocular Video","BookTitle":null,"Year":2019,"Date":"2019-01-01T00:00:00Z","Publisher":"Springer, Singapore","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"357","LastPage":"365","ReferenceCount":14,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-07-19T00:00:00Z"},
{"PaperId":2883520096,"Rank":3,"Doi":"10.4995/RLYLA.2018.9176","DocType":"Journal","PaperTitle":"elaboracion de un corpus cacografico desde la disponibilidad lexica en estudiantes sevillanos un analisis para la ensenanza de la lengua","OriginalTitle":"Elaboración de un corpus cacográfico desde la disponibilidad léxica en estudiantes sevillanos. Un análisis para la enseñanza de la lengua","BookTitle":null,"Year":2018,"Date":"2018-07-13T00:00:00Z","Publisher":"Poli papers","JournalId":2738339871,"Volume":"13","Issue":"1","FirstPage":"119","LastPage":"131","ReferenceCount":28,"CitationCount":2,"EstimatedCitation":2,"OriginalVenue":"Revista de Lingüística y Lenguas Aplicadas","CreatedDate":"2018-08-03T00:00:00Z"},
{"PaperId":2883800636,"Rank":4,"Doi":"10.1007/978-3-319-92513-4_4","DocType":null,"PaperTitle":"cognitive advantage of bilingualism and its criticisms","OriginalTitle":"Cognitive Advantage of Bilingualism and Its Criticisms","BookTitle":null,"Year":2018,"Date":"2018-01-01T00:00:00Z","Publisher":"Springer, Cham","JournalId":null,"Volume":null,"Issue":null,"FirstPage":"67","LastPage":"89","ReferenceCount":74,"CitationCount":1,"EstimatedCitation":1,"OriginalVenue":null,"CreatedDate":"2018-08-03T00:00:00Z"},
{"PaperId":2885023064,"Rank":5,"Doi":"10.1097/NNA.0000000000000647","DocType":"Journal","PaperTitle":"enhancing and advancing shared governance through a targeted decision making redesign","OriginalTitle":"Enhancing and Advancing Shared Governance Through a Targeted Decision-Making Redesign.","BookTitle":null,"Year":2018,"Date":"2018-09-01T00:00:00Z","Publisher":"J Nurs Adm","JournalId":194945867,"Volume":"48","Issue":"9","FirstPage":"445","LastPage":"451","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Journal of Nursing Administration","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2885607541,"Rank":1,"Doi":"10.1007/S10465-018-9283-7","DocType":"Journal","PaperTitle":"dance movement therapists attitudes and actions regarding lgbtqi and gender nonconforming communities","OriginalTitle":"Dance/Movement Therapists Attitudes and Actions Regarding LGBTQI and Gender Nonconforming Communities","BookTitle":null,"Year":2018,"Date":"2018-08-07T00:00:00Z","Publisher":"Springer US","JournalId":104993962,"Volume":"40","Issue":"2","FirstPage":"202","LastPage":"223","ReferenceCount":40,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"American Journal of Dance Therapy","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2886182429,"Rank":2,"Doi":"10.13039/501100003329","DocType":null,"PaperTitle":"caracteres de adaptacion en judia comun phaseolus vulgaris l aproximacion genetica e identificacion de qtls","OriginalTitle":"Caracteres de adaptación en judía común (Phaseolus vulgaris L.): aproximación genética e identificación de QTLs","BookTitle":null,"Year":2017,"Date":"2017-06-15T00:00:00Z","Publisher":"CSIC - Misión Biológica de Galicia (MBG)","JournalId":null,"Volume":null,"Issue":null,"FirstPage":null,"LastPage":null,"ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":null,"CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2887149460,"Rank":3,"Doi":"10.1093/FEMSLE/FNY192","DocType":"Journal","PaperTitle":"small extracellular particles with big potential for horizontal gene transfer membrane vesicles and gene transfer agents","OriginalTitle":"Small extracellular particles with big potential for horizontal gene transfer: membrane vesicles and gene transfer agents.","BookTitle":null,"Year":2018,"Date":"2018-10-01T00:00:00Z","Publisher":"Narnia","JournalId":34954451,"Volume":"365","Issue":"19","FirstPage":null,"LastPage":null,"ReferenceCount":124,"CitationCount":13,"EstimatedCitation":13,"OriginalVenue":"Fems Microbiology Letters","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2887446149,"Rank":4,"Doi":"10.15585/MMWR.MM6730A7","DocType":"Journal","PaperTitle":"notes from the field toxigenic vibrio cholerae o141 in a traveler to florida nebraska 2017","OriginalTitle":"Notes from the Field: Toxigenic Vibrio cholerae O141 in a Traveler to Florida — Nebraska, 2017","BookTitle":null,"Year":2018,"Date":"2018-08-03T00:00:00Z","Publisher":"Centers for Disease Control MMWR Office","JournalId":183158886,"Volume":"67","Issue":"30","FirstPage":"838","LastPage":"839","ReferenceCount":0,"CitationCount":0,"EstimatedCitation":0,"OriginalVenue":"Morbidity and Mortality Weekly Report","CreatedDate":"2018-08-22T00:00:00Z"},
{"PaperId":2889180499,"Rank":5,"Doi":"10.1007/S10924-018-1299-Z","DocType":"Journal","PaperTitle":"hybrid adsorbent materials obtained by the combination of poly ethylene alt maleic anhydride with lignin and lignosulfonate","OriginalTitle":"Hybrid Adsorbent Materials Obtained by the Combination of Poly(ethylene-alt-maleic anhydride) with Lignin and Lignosulfonate","BookTitle":null,"Year":2018,"Date":"2018-08-30T00:00:00Z","Publisher":"Springer US","JournalId":193665811,"Volume":"26","Issue":"11","FirstPage":"4293","LastPage":"4302","ReferenceCount":29,"CitationCount":5,"EstimatedCitation":5,"OriginalVenue":"Journal of Polymers and The Environment","CreatedDate":"2018-09-07T00:00:00Z"}]


@@ -93,9 +93,9 @@ public class PrepareResultInstRepoAssociation {
SparkSession spark, String datasourceOrganizationPath, List<String> blacklist) {
String blacklisted = "";
if (blacklist.size() > 0) {
blacklisted = " AND d.id != '" + blacklist.get(0) + "'";
blacklisted = " AND id != '" + blacklist.get(0) + "'";
for (int i = 1; i < blacklist.size(); i++) {
blacklisted += " AND d.id != '" + blacklist.get(i) + "'";
blacklisted += " AND id != '" + blacklist.get(i) + "'";
}
}


@@ -24,8 +24,6 @@ public class Constants {
public static String RESEARCH_INFRASTRUCTURE = "Research Infrastructure/Initiative";
public static String ORCID = "orcid";
static {
accessRightsCoarMap.put("OPEN", "c_abf2");
accessRightsCoarMap.put("RESTRICTED", "c_16ec");


@@ -503,7 +503,7 @@ public class ResultMapper implements Serializable {
private static Pid getOrcid(List<StructuredProperty> p) {
for (StructuredProperty pid : p) {
-if (pid.getQualifier().getClassid().equals(Constants.ORCID)) {
+if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
if (di.isPresent()) {
return Pid


@@ -68,7 +68,7 @@ public abstract class AbstractMdRecordToOafMapper {
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
protected static final String DATACITE_SCHEMA_KERNEL_3_SLASH = "http://datacite.org/schema/kernel-3/";
protected static final Qualifier ORCID_PID_TYPE = qualifier(
"ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES);
ModelConstants.ORCID_PENDING, ModelConstants.ORCID_CLASSNAME, DNET_PID_TYPES, DNET_PID_TYPES);
protected static final Qualifier MAG_PID_TYPE = qualifier(
"MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES);


@@ -19,6 +19,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
@@ -61,7 +62,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
author.setPid(new ArrayList<>());
if (StringUtils.isNotBlank(pid)) {
if (type.startsWith("ORCID")) {
if (type.toLowerCase().startsWith(ORCID)) {
final String cleanedId = pid
.replaceAll("http://orcid.org/", "")
.replaceAll("https://orcid.org/", "");


@@ -20,6 +20,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
@@ -98,7 +99,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
.replaceAll(" ", "")
.replaceAll("_", "");
if (type.startsWith("ORCID")) {
if (type.toLowerCase().startsWith(ModelConstants.ORCID)) {
final String cleanedId = id.replaceAll("http://orcid.org/", "").replaceAll("https://orcid.org/", "");
res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info));
} else if (type.startsWith("MAGID")) {


@@ -16,6 +16,14 @@
<name>monitor_db_production_name</name>
<description>the name of the monitor public database</description>
</property>
<property>
<name>observatory_db_name</name>
<description>the monitor database name</description>
</property>
<property>
<name>observatory_db_production_name</name>
<description>the name of the monitor public database</description>
</property>
<property>
<name>stats_tool_api_url</name>
<description>The url of the API of the stats tool. Is used to trigger the cache promote.</description>
@@ -77,6 +85,19 @@
<argument>${monitor_db_production_name}</argument>
<file>updateProductionViews.sh</file>
</shell>
<ok to="updateObservatoryViews"/>
<error to="Kill"/>
</action>
<action name="updateObservatoryViews">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>updateProductionViews.sh</exec>
<argument>${observatory_db_name}</argument>
<argument>${observatory_db_production_name}</argument>
<file>updateProductionViews.sh</file>
</shell>
<ok to="promoteCache"/>
<error to="Kill"/>
</action>


@@ -0,0 +1,28 @@
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
if ! [ -L $link_folder ]
then
rm -Rf "$link_folder"
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export SOURCE=$1
export TARGET=$2
export SHADOW=$3
export SCRIPT_PATH=$4
echo "Getting file from " $4
hdfs dfs -copyToLocal $4
echo "Creating observatory database"
impala-shell -q "drop database if exists ${TARGET} cascade"
impala-shell -q "create database if not exists ${TARGET}"
impala-shell -d ${SOURCE} -q "show tables" --delimited | sed "s/\(.*\)/create view ${TARGET}.\1 as select * from ${SOURCE}.\1;/" | impala-shell -f -
cat step21-createObservatoryDB.sql | sed s/SOURCE/$1/g | sed s/TARGET/$2/g | impala-shell -f -
echo "Impala shell finished"
echo "Updating shadow observatory database"
impala-shell -q "create database if not exists ${SHADOW}"
impala-shell -d ${SHADOW} -q "show tables" --delimited | sed "s/^/drop view if exists ${SHADOW}./" | sed "s/$/;/" | impala-shell -f -
impala-shell -d ${TARGET} -q "show tables" --delimited | sed "s/\(.*\)/create view ${SHADOW}.\1 as select * from ${TARGET}.\1;/" | impala-shell -f -
echo "Shadow db ready!"


@@ -45,35 +45,3 @@ FROM ${stats_db_name}.dataset
UNION ALL
SELECT *, bestlicence AS access_mode
FROM ${stats_db_name}.otherresearchproduct;
-------------------------------------------------------------------------------
-- To see with Antonis if the following is needed and where it should be placed
-------------------------------------------------------------------------------
CREATE TABLE ${stats_db_name}.numbers_country AS
SELECT org.country AS country, count(distinct rd.datasource) AS datasources, count(distinct r.id) AS publications
FROM ${stats_db_name}.result r,
${stats_db_name}.result_datasources rd,
${stats_db_name}.datasource d,
${stats_db_name}.datasource_organizations dor,
${stats_db_name}.organization org
WHERE r.id = rd.id
AND rd.datasource = d.id
AND d.id = dor.id
AND dor.organization = org.id
AND r.type = 'publication'
AND r.bestlicence = 'Open Access'
GROUP BY org.country;
-- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.datasource COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.publication COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.dataset COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.software COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.otherresearchproduct COMPUTE STATISTICS FOR COLUMNS;
-- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS;
-- ANALYZE TABLE ${stats_db_name}.numbers_country COMPUTE STATISTICS FOR COLUMNS;


@@ -59,33 +59,4 @@ from result_gold
union all
select distinct r.id, false as gold
from ${stats_db_name}.result r
-where r.id not in (select id from result_gold);
--- shortcut result-country through the organization affiliation
-create table ${stats_db_name}.result_affiliated_country as
-select r.id as id, o.country as country
-from ${stats_db_name}.result r
-join ${stats_db_name}.result_organization ro on ro.id=r.id
-join ${stats_db_name}.organization o on o.id=ro.organization
-where o.country is not null and o.country!='';
--- shortcut result-country through datasource of deposition
-create table ${stats_db_name}.result_deposited_country as
-select r.id as id, o.country as country
-from ${stats_db_name}.result r
-join ${stats_db_name}.result_datasources rd on rd.id=r.id
-join ${stats_db_name}.datasource d on d.id=rd.datasource
-join ${stats_db_name}.datasource_organizations dor on dor.id=d.id
-join ${stats_db_name}.organization o on o.id=dor.organization
-where o.country is not null and o.country!='';
--- ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.result_peerreviewed COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.result_greenoa COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.result_gold COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.result_affiliated_country COMPUTE STATISTICS FOR COLUMNS;
--- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.result_deposited_country COMPUTE STATISTICS FOR COLUMNS;
+where r.id not in (select id from result_gold);


@@ -52,7 +52,4 @@ LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
drop table if exists ${stats_db_name}.result;
drop view if exists ${stats_db_name}.result;
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
-drop table ${stats_db_name}.result_tmp;
---
--- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS;
--- ANALYZE TABLE ${stats_db_name}.result COMPUTE STATISTICS FOR COLUMNS;
+drop table ${stats_db_name}.result_tmp;


@@ -19,9 +19,6 @@ create table TARGET.result as
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) ) foo;
compute stats TARGET.result;
-create table TARGET.result_affiliated_country as select * from SOURCE.result_affiliated_country rac where exists (select 1 from TARGET.result r where r.id=rac.id);
-compute stats TARGET.result_affiliated_country;
create table TARGET.result_citations as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_citations;
@@ -34,9 +31,6 @@ compute stats TARGET.result_concepts;
create table TARGET.result_datasources as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_datasources;
-create table TARGET.result_deposited_country as select * from SOURCE.result_deposited_country orig where exists (select 1 from TARGET.result r where r.id=orig.id);
-compute stats TARGET.result_deposited_country;
create table TARGET.result_fundercount as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_fundercount;


@@ -0,0 +1,259 @@
create table TARGET.result_affiliated_country stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, c.code as ccode, c.name as cname
from SOURCE.result r
join SOURCE.result_organization ro on ro.id=r.id
join SOURCE.organization o on o.id=ro.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name;
create table TARGET.result_affiliated_year stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year
from SOURCE.result r
join SOURCE.result_organization ro on ro.id=r.id
join SOURCE.organization o on o.id=ro.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year;
create table TARGET.result_affiliated_year_country stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname
from SOURCE.result r
join SOURCE.result_organization ro on ro.id=r.id
join SOURCE.organization o on o.id=ro.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name;
create table TARGET.result_affiliated_datasource stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, d.name as dname
from SOURCE.result r
join SOURCE.result_organization ro on ro.id=r.id
join SOURCE.organization o on o.id=ro.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_datasources rd on rd.id=r.id
left outer join SOURCE.datasource d on d.id=rd.datasource
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name;
create table TARGET.result_affiliated_datasource_country stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname
from SOURCE.result r
join SOURCE.result_organization ro on ro.id=r.id
join SOURCE.organization o on o.id=ro.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_datasources rd on rd.id=r.id
left outer join SOURCE.datasource d on d.id=rd.datasource
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name;
create table TARGET.result_affiliated_organization stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, o.name as oname
from SOURCE.result r
join SOURCE.result_organization ro on ro.id=r.id
join SOURCE.organization o on o.id=ro.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name;
create table TARGET.result_affiliated_organization_country stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname
from SOURCE.result r
join SOURCE.result_organization ro on ro.id=r.id
join SOURCE.organization o on o.id=ro.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name;
create table TARGET.result_affiliated_funder stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, p.funder as pfunder
from SOURCE.result r
join SOURCE.result_organization ro on ro.id=r.id
join SOURCE.organization o on o.id=ro.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
join SOURCE.result_projects rp on rp.id=r.id
join SOURCE.project p on p.id=rp.project
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder;
create table TARGET.result_affiliated_funder_country stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname
from SOURCE.result r
join SOURCE.result_organization ro on ro.id=r.id
join SOURCE.organization o on o.id=ro.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
join SOURCE.result_projects rp on rp.id=r.id
join SOURCE.project p on p.id=rp.project
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name;
create table TARGET.result_deposited_country stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, c.code as ccode, c.name as cname
from SOURCE.result r
join SOURCE.result_datasources rd on rd.id=r.id
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
join SOURCE.datasource_organizations dor on dor.id=d.id
join SOURCE.organization o on o.id=dor.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, c.code, c.name;
create table TARGET.result_deposited_year stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, r.year
from SOURCE.result r
join SOURCE.result_datasources rd on rd.id=r.id
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
join SOURCE.datasource_organizations dor on dor.id=d.id
join SOURCE.organization o on o.id=dor.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year;
create table TARGET.result_deposited_year_country stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, r.year, c.code as ccode, c.name as cname
from SOURCE.result r
join SOURCE.result_datasources rd on rd.id=r.id
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
join SOURCE.datasource_organizations dor on dor.id=d.id
join SOURCE.organization o on o.id=dor.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, r.year, c.code, c.name;
create table TARGET.result_deposited_datasource stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, d.name as dname
from SOURCE.result r
join SOURCE.result_datasources rd on rd.id=r.id
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
join SOURCE.datasource_organizations dor on dor.id=d.id
join SOURCE.organization o on o.id=dor.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name;
create table TARGET.result_deposited_datasource_country stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, d.name as dname, c.code as ccode, c.name as cname
from SOURCE.result r
join SOURCE.result_datasources rd on rd.id=r.id
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
join SOURCE.datasource_organizations dor on dor.id=d.id
join SOURCE.organization o on o.id=dor.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, d.name, c.code, c.name;
create table TARGET.result_deposited_organization stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa, r.peer_reviewed, r.type, o.name as oname
from SOURCE.result r
join SOURCE.result_datasources rd on rd.id=r.id
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
join SOURCE.datasource_organizations dor on dor.id=d.id
join SOURCE.organization o on o.id=dor.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name;
create table TARGET.result_deposited_organization_country stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, o.name as oname, c.code as ccode, c.name as cname
from SOURCE.result r
join SOURCE.result_datasources rd on rd.id=r.id
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
join SOURCE.datasource_organizations dor on dor.id=d.id
join SOURCE.organization o on o.id=dor.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, o.name, c.code, c.name;
create table TARGET.result_deposited_funder stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, p.funder as pfunder
from SOURCE.result r
join SOURCE.result_datasources rd on rd.id=r.id
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
join SOURCE.datasource_organizations dor on dor.id=d.id
join SOURCE.organization o on o.id=dor.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
join SOURCE.result_projects rp on rp.id=r.id
join SOURCE.project p on p.id=rp.project
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder;
create table TARGET.result_deposited_funder_country stored as parquet as
select count(distinct r.id) as total, r.green, r.gold, case when rl.type is not null then true else false end as licence,
case when pids.pid is not null then true else false end as pid, case when r.access_mode in ('Open Access', 'Open Source') then true else false end as oa,
r.peer_reviewed, r.type, p.funder as pfunder, c.code as ccode, c.name as cname
from SOURCE.result r
join SOURCE.result_datasources rd on rd.id=r.id
join SOURCE.datasource d on d.id=rd.datasource and d.type in ('Institutional Repository','Data Repository', 'Repository', 'Publication Repository')
join SOURCE.datasource_organizations dor on dor.id=d.id
join SOURCE.organization o on o.id=dor.organization
join SOURCE.country c on c.code=o.country and c.continent_name='Europe'
join SOURCE.result_projects rp on rp.id=r.id
join SOURCE.project p on p.id=rp.project
left outer join SOURCE.result_licenses rl on rl.id=r.id
left outer join SOURCE.result_pids pids on pids.id=r.id
group by r.green, r.gold, licence, pid, oa, r.peer_reviewed, r.type, p.funder, c.code, c.name;
compute stats TARGET.result_affiliated_country;
compute stats TARGET.result_affiliated_year;
compute stats TARGET.result_affiliated_year_country;
compute stats TARGET.result_affiliated_datasource;
compute stats TARGET.result_affiliated_datasource_country;
compute stats TARGET.result_affiliated_organization;
compute stats TARGET.result_affiliated_organization_country;
compute stats TARGET.result_affiliated_funder;
compute stats TARGET.result_affiliated_funder_country;
compute stats TARGET.result_deposited_country;
compute stats TARGET.result_deposited_year;
compute stats TARGET.result_deposited_year_country;
compute stats TARGET.result_deposited_datasource;
compute stats TARGET.result_deposited_datasource_country;
compute stats TARGET.result_deposited_organization;
compute stats TARGET.result_deposited_organization_country;
compute stats TARGET.result_deposited_funder;
compute stats TARGET.result_deposited_funder_country;


@@ -25,6 +25,14 @@
<name>monitor_db_shadow_name</name>
<description>the name of the shadow monitor db</description>
</property>
<property>
<name>observatory_db_name</name>
<description>the target monitor db name</description>
</property>
<property>
<name>observatory_db_shadow_name</name>
<description>the name of the shadow monitor db</description>
</property>
<property>
<name>stats_tool_api_url</name>
<description>The url of the API of the stats tool. Is used to trigger the cache update.</description>
@@ -305,11 +313,26 @@
<argument>${wf:appPath()}/scripts/step20-createMonitorDB.sql</argument>
<file>monitor.sh</file>
</shell>
<ok to="Step21"/>
<ok to="step21-createObservatoryDB"/>
<error to="Kill"/>
</action>
<action name="Step21">
<action name="step21-createObservatoryDB">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>observatory.sh</exec>
<argument>${stats_db_name}</argument>
<argument>${observatory_db_name}</argument>
<argument>${observatory_db_shadow_name}</argument>
<argument>${wf:appPath()}/scripts/step21-createObservatoryDB.sql</argument>
<file>observatory.sh</file>
</shell>
<ok to="Step22"/>
<error to="Kill"/>
</action>
<action name="Step22">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
@@ -322,4 +345,4 @@
</action>
<end name="End"/>
</workflow-app>
</workflow-app>