From 4bb3bcafa544cd5a4806c610d6346ff55e4356da Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Thu, 11 Mar 2021 11:32:32 +0100 Subject: [PATCH] add author sequence number --- .../java/eu/dnetlib/dhp/schema/oaf/Author.java | 1 + .../dnetlib/doiboost/crossref/Crossref2Oaf.scala | 12 +++++++++--- .../eu/dnetlib/doiboost/mag/MagDataModel.scala | 10 +++++----- .../dnetlib/doiboost/mag/SparkProcessMAG.scala | 4 ++-- .../eu/dnetlib/doiboost/crossref/article.json | 16 ++++++++-------- 5 files changed, 25 insertions(+), 18 deletions(-) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java index 231fb1e606..b2f757d71a 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java @@ -12,6 +12,7 @@ public class Author implements Serializable { private String surname; + // START WITH 1 private Integer rank; private List pid; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 9251bba0e7..79194b283d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -21,7 +21,7 @@ case class CrossrefDT(doi: String, json:String, timestamp: Long) {} case class mappingAffiliation(name: String) {} -case class mappingAuthor(given: Option[String], family: String, ORCID: Option[String], affiliation: Option[mappingAffiliation]) {} +case class mappingAuthor(given: Option[String], family: String, sequence:Option[String], ORCID: Option[String], affiliation: Option[mappingAffiliation]) {} case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {} @@ -162,7 +162,12 @@ case object Crossref2Oaf { //Mapping Author val authorList: List[mappingAuthor] = (json \ "author").extractOrElse[List[mappingAuthor]](List()) - result.setAuthor(authorList.map(a => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull)).asJava) + + + + val sorted_list = authorList.sortWith((a:mappingAuthor, b:mappingAuthor) => a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first")) + + result.setAuthor(sorted_list.zipWithIndex.map{case (a, index) => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull, index)}.asJava) // Mapping instance val instance = new Instance() @@ -205,11 +210,12 @@ case object Crossref2Oaf { } - def generateAuhtor(given: String, family: String, orcid: String): Author = { + def generateAuhtor(given: String, family: String, orcid: String, index:Int): Author = { val a = new Author a.setName(given) a.setSurname(family) a.setFullname(s"$given $family") + a.setRank(index+1) if (StringUtils.isNotBlank(orcid)) a.setPid(List(createSP(orcid, ORCID_PENDING, PID_TYPES, generateDataInfo())).asJava) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala index 910fad0e20..987a81fba8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -32,11 +32,11 @@ case class MagAffiliation(AffiliationId: Long, Rank: Int, NormalizedName: String case class MagPaperAuthorAffiliation(PaperId: Long, AuthorId: Long, AffiliationId: Option[Long], AuthorSequenceNumber: Int, OriginalAuthor: String, OriginalAffiliation: String) {} -case class MagAuthorAffiliation(author: MagAuthor, affiliation:String) +case class MagAuthorAffiliation(author: MagAuthor, affiliation:String, sequenceNumber:Int) case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation]) {} -case class MagPaperAuthorDenormalized(PaperId: Long, author: MagAuthor, affiliation:String) {} +case class MagPaperAuthorDenormalized(PaperId: Long, author: MagAuthor, affiliation:String, sequenceNumber:Int) {} case class MagPaperUrl(PaperId: Long, SourceType: Option[Int], SourceUrl: Option[String], LanguageCode: Option[String]) {} @@ -209,9 +209,9 @@ case object ConversionUtil { val authorsOAF = authors.authors.map { f: MagAuthorAffiliation => val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author - - a.setFullname(f.author.DisplayName.get) - + a.setRank(f.sequenceNumber) + if (f.author.DisplayName.isDefined) + a.setFullname(f.author.DisplayName.get) if(f.affiliation!= null) a.setAffiliation(List(asField(f.affiliation)).asJava) a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", PID_TYPES)).asJava) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index 780e65c1e9..bc1982e778 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -58,13 +58,13 @@ object SparkProcessMAG { val paperAuthorAffiliation = spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation] paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId"))) - .map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null)) } + .map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null, a.AuthorSequenceNumber)) } .joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left") .map(s => { val mpa = s._1._2 val af = s._2 if (af != null) { - MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName) + MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName, mpa.sequenceNumber) } else mpa }).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors")) diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article.json index e0dc0db39d..69424d0ad7 100644 --- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article.json +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/article.json @@ -12,14 +12,6 @@ "abstract": "A qualitative spot-test and tandem quantitative analysis of dipyrone in the bulk drugand in pharmaceutical preparations is proposed. The formation of a reddish-violet\u00a0 color indicates a positive result. In sequence a quantitative procedure can be performed in the same flask. The quantitative results obtained were statistically compared with those obtained with the method indicated by the Brazilian\u00a0 Pharmacopoeia, using the Student\u2019s t and the F tests. Considering the concentration in a 100 \u03bcL aliquot, the qualitative visual limit of detection is about 5\u00d710-6 g; instrumental LOD \u2245 1.4\u00d710-4 mol L-1 ; LOQ \u2245 4.5\u00d710-4 mol L-1.", "prefix": "10.26850", "author": [ - { - "authenticated-orcid": false, - "given": "Matthieu", - "family": "Tubino", - "sequence": "first", - "affiliation": [], - "ORCID": "http://orcid.org/0000-0002-1987-3907" - }, { "affiliation": [], "given": "A. C.", @@ -49,6 +41,14 @@ "sequence": "additional", "affiliation": [], "ORCID": "http://orcid.org/0000-0001-5564-1639" + }, + { + "authenticated-orcid": false, + "given": "Matthieu", + "family": "Tubino", + "sequence": "first", + "affiliation": [], + "ORCID": "http://orcid.org/0000-0002-1987-3907" } ], "reference-count": 0,