merge with beta - resolved conflict in pom

2021-11-12 10:19:59 +01:00 · 2021-11-12 10:19:59 +01:00 · ffb0ce1d59
parent 716021546e 1f2a3d1af0
commit ffb0ce1d59
16 changed files with 1025 additions and 38 deletions
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@ -70,7 +70,7 @@ case object Crossref2Oaf {
    "reference-book" -> "0002 Book",
    "monograph" -> "0002 Book",
    "journal-article" -> "0001 Article",
-    "dissertation" -> "0006 Doctoral thesis",
+    "dissertation" -> "0044 Thesis",
    "other" -> "0038 Other literature type",
    "peer-review" -> "0015 Review",
    "proceedings" -> "0004 Conference object",
@ -206,11 +206,16 @@ case object Crossref2Oaf {
    else {
      instance.setDateofacceptance(asField(createdDate.getValue))
    }
-    val s: String = (json \ "URL").extract[String]
+    val s: List[String] = List("https://doi.org/" + doi)
-    val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null).distinct
+//    val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && p.toLowerCase().contains(doi.toLowerCase())).distinct
-    if (links.nonEmpty) {
+//    if (links.nonEmpty) {
-      instance.setUrl(links.asJava)
+//      instance.setUrl(links.asJava)
 //    }
    if(s.nonEmpty)
      {
        instance.setUrl(s.asJava)
      }
    result.setInstance(List(instance).asJava)
    //IMPORTANT
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala
@ -111,26 +111,9 @@ object SparkProcessMAG {
      .map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/merge_step_2_conference")
    magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
    val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
    logger.info("Phase 5) enrich publication with URL and Instances")
    magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
      .map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
      .write.mode(SaveMode.Overwrite)
      .save(s"$workingPath/merge_step_3")
 //    logger.info("Phase 6) Enrich Publication with description")
 //    val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
 //    pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
    val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]
@ -162,12 +145,14 @@ object SparkProcessMAG {
      .write.mode(SaveMode.Overwrite)
      .save(s"$workingPath/mag_publication")
-
+    spark.read.load(s"$workingPath/mag_publication").as[Publication]
-    val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
+      .filter(p => p.getId == null)
-      .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
+      .groupByKey(p => p.getId)
      .reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
      .map(_._2)
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
    spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
  }
 }
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala
@ -612,4 +612,26 @@ class CrossrefMappingTest {
  }
  @Test
  def testMultipleURLs() :Unit = {
    val json = Source.fromInputStream(getClass.getResourceAsStream("multiple_urls.json")).mkString
    assertNotNull(json)
    assertFalse(json.isEmpty);
    val resultList: List[Oaf] = Crossref2Oaf.convert(json)
    assertTrue(resultList.nonEmpty)
    val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result]
    assertEquals(1, item.getInstance().size())
    assertEquals(1, item.getInstance().get(0).getUrl().size())
    assertEquals("https://doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0))
    //println(mapper.writeValueAsString(item))
  }
 }
--- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/multiple_urls.json
+++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/multiple_urls.json
@ -0,0 +1,614 @@
 {
 "indexed": {
 "date-parts": [
 [
 2021,
 10,
 31
 ]
 ],
 "date-time": "2021-10-31T15:48:01Z",
 "timestamp": 1635695281393
 },
 "reference-count": 39,
 "publisher": "Elsevier BV",
 "license": [
 {
 "start": {
 "date-parts": [
 [
 2019,
 12,
 1
 ]
 ],
 "date-time": "2019-12-01T00:00:00Z",
 "timestamp": 1575158400000
 },
 "content-version": "tdm",
 "delay-in-days": 0,
 "URL": "https://www.elsevier.com/tdm/userlicense/1.0/"
 },
 {
 "start": {
 "date-parts": [
 [
 2019,
 9,
 13
 ]
 ],
 "date-time": "2019-09-13T00:00:00Z",
 "timestamp": 1568332800000
 },
 "content-version": "vor",
 "delay-in-days": 0,
 "URL": "http://creativecommons.org/licenses/by/4.0/"
 }
 ],
 "funder": [
 {
 "DOI": "10.13039/100001182",
 "name": "INSTAP",
 "doi-asserted-by": "publisher"
 },
 {
 "DOI": "10.13039/100014440",
 "name": "Ministry of Science, Innovation and Universities",
 "doi-asserted-by": "publisher",
 "award": [
 "RYC-2016-19637"
 ]
 },
 {
 "DOI": "10.13039/100010661",
 "name": "European Union’s Horizon 2020",
 "doi-asserted-by": "publisher",
 "award": [
 "746446"
 ]
 }
 ],
 "content-domain": {
 "domain": [
 "elsevier.com",
 "sciencedirect.com"
 ],
 "crossmark-restriction": true
 },
 "short-container-title": [
 "Journal of Archaeological Science"
 ],
 "published-print": {
 "date-parts": [
 [
 2019,
 12
 ]
 ]
 },
 "DOI": "10.1016/j.jas.2019.105013",
 "type": "journal-article",
 "created": {
 "date-parts": [
 [
 2019,
 9,
 25
 ]
 ],
 "date-time": "2019-09-25T20:05:08Z",
 "timestamp": 1569441908000
 },
 "page": "105013",
 "update-policy": "http://dx.doi.org/10.1016/elsevier_cm_policy",
 "source": "Crossref",
 "is-referenced-by-count": 21,
 "title": [
 "A brave new world for archaeological survey: Automated machine learning-based potsherd detection using high-resolution drone imagery"
 ],
 "prefix": "10.1016",
 "volume": "112",
 "author": [
 {
 "given": "H.A.",
 "family": "Orengo",
 "sequence": "first",
 "affiliation": [
 ]
 },
 {
 "given": "A.",
 "family": "Garcia-Molsosa",
 "sequence": "additional",
 "affiliation": [
 ]
 }
 ],
 "member": "78",
 "reference": [
 {
 "key": "10.1016/j.jas.2019.105013_bib1",
 "doi-asserted-by": "crossref",
 "first-page": "85",
 "DOI": "10.1080/17538947.2016.1250829",
 "article-title": "Remote sensing heritage in a petabyte-scale: satellite data and heritage Earth Engine© applications",
 "volume": "10",
 "author": "Agapiou",
 "year": "2017",
 "journal-title": "Int. J. Digit. Earth"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib2",
 "series-title": "Extracting Meaning from Ploughsoil Assemblages",
 "first-page": "1",
 "article-title": "Extracting meaning from ploughsoil assemblages: assessments of the past, strategies for the future",
 "author": "Alcock",
 "year": "2000"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib3",
 "series-title": "Side-by-Side Survey. Comparative Regional Studies in the Mediterranean World",
 "first-page": "1",
 "article-title": "Introduction",
 "author": "Alcock",
 "year": "2004"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib4",
 "doi-asserted-by": "crossref",
 "first-page": "93",
 "DOI": "10.1111/j.1538-4632.1995.tb00338.x",
 "article-title": "Local indicators of spatial association—LISA",
 "volume": "27",
 "author": "Anselin",
 "year": "1995",
 "journal-title": "Geogr. Anal."
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib5",
 "series-title": "Archaeological Survey",
 "author": "Banning",
 "year": "2002"
 },
 {
 "issue": "1/2",
 "key": "10.1016/j.jas.2019.105013_bib6",
 "doi-asserted-by": "crossref",
 "first-page": "123",
 "DOI": "10.2307/3181488",
 "article-title": "GIS, archaeological survey and landscape archaeology on the island of Kythera, Greece",
 "volume": "29",
 "author": "Bevan",
 "year": "2004",
 "journal-title": "J. Field Archaeol."
 },
 {
 "issue": "1",
 "key": "10.1016/j.jas.2019.105013_bib8",
 "doi-asserted-by": "crossref",
 "first-page": "5",
 "DOI": "10.1023/A:1010933404324",
 "article-title": "Random forests",
 "volume": "45",
 "author": "Breiman",
 "year": "2001",
 "journal-title": "Mach. Learn."
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib9",
 "series-title": "Sampling in Contemporary British Archaeology",
 "author": "Cherry",
 "year": "1978"
 },
 {
 "issue": "3",
 "key": "10.1016/j.jas.2019.105013_bib10",
 "doi-asserted-by": "crossref",
 "first-page": "273",
 "DOI": "10.1016/0734-189X(84)90197-X",
 "article-title": "Segmentation of a high-resolution urban scene using texture operators",
 "volume": "25",
 "author": "Conners",
 "year": "1984",
 "journal-title": "Comput. Vis. Graph Image Process"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib11",
 "first-page": "31",
 "article-title": "Old land surfaces and modern ploughsoil: implications of recent work at Maxey, Cambridgeshire",
 "volume": "2",
 "author": "Crowther",
 "year": "1983",
 "journal-title": "Scott. Archaeol. Rev."
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib12",
 "series-title": "Settlement Pattern Studies in the Americas: Fifty Years since Virú",
 "first-page": "203",
 "article-title": "Conclusions: the settlement pattern concept from an Americanist perspective",
 "author": "Fish",
 "year": "1999"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib13",
 "doi-asserted-by": "crossref",
 "first-page": "21",
 "DOI": "10.3390/geosciences9010021",
 "article-title": "Remote sensing and historical morphodynamics of alluvial plains. The 1909 indus flood and the city of Dera Gazhi Khan (province of Punjab, Pakistan)",
 "volume": "9",
 "author": "Garcia",
 "year": "2019",
 "journal-title": "Geosciences"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib14",
 "unstructured": "Georgiadis, M.; Garcia-Molsosa, A.; Orengo, H.A.; Kefalidou, E. and Kallintzi, K. In Preparation. APAX Project 2015-2018: A Preliminary Report. (Hesperia)."
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib15",
 "series-title": "Geographical Information Systems and Landscape Archaeology",
 "first-page": "35",
 "article-title": "Regional survey and GIS: the boeotia project",
 "author": "Gillings",
 "year": "1999"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib16",
 "doi-asserted-by": "crossref",
 "first-page": "18",
 "DOI": "10.1016/j.rse.2017.06.031",
 "article-title": "Google Earth engine: planetary-scale geospatial analysis for everyone",
 "volume": "202",
 "author": "Gorelick",
 "year": "2017",
 "journal-title": "Remote Sens. Environ."
 },
 {
 "issue": "107",
 "key": "10.1016/j.jas.2019.105013_bib17",
 "doi-asserted-by": "crossref",
 "first-page": "177",
 "DOI": "10.1111/j.0031-868X.2004.00278.x",
 "article-title": "Photogrammetric reconstruction of the great buddha of Bamiyan, Afghanistan",
 "volume": "19",
 "author": "Grün",
 "year": "2004",
 "journal-title": "Photogramm. Rec."
 },
 {
 "issue": "6",
 "key": "10.1016/j.jas.2019.105013_bib18",
 "doi-asserted-by": "crossref",
 "first-page": "610",
 "DOI": "10.1109/TSMC.1973.4309314",
 "article-title": "Textural features for image classification",
 "author": "Haralick",
 "year": "1973",
 "journal-title": "IEEE Trans. Syst., Man, Cybernet., SMC-3"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib19",
 "doi-asserted-by": "crossref",
 "first-page": "76",
 "DOI": "10.1558/jmea.v14i1.76",
 "article-title": "Excavating to excess? Implications of the last decade of archaeology in Israel",
 "volume": "14",
 "author": "Kletter",
 "year": "2001",
 "journal-title": "J. Mediterr. Archaeol."
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib20",
 "first-page": "299",
 "article-title": "Testing Google Earth Engine for the automatic identification and vectorization of archaeological features: a case study from Faynan, Jordan",
 "volume": "15",
 "author": "Liss",
 "year": "2017",
 "journal-title": "J. Archaeol. Sci.: Report"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib21",
 "series-title": "Geographical Information Systems and Landscape Archaeology",
 "first-page": "55",
 "article-title": "Towards a methodology for modelling surface survey data: the sangro valley project",
 "author": "Lock",
 "year": "1999"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib22",
 "series-title": "Extracting Meaning from Ploughsoil Assemblages",
 "first-page": "5",
 "article-title": "Methods of collection recording and quantification",
 "author": "Mattingly",
 "year": "2000"
 },
 {
 "issue": "14",
 "key": "10.1016/j.jas.2019.105013_bib23",
 "doi-asserted-by": "crossref",
 "first-page": "E778",
 "DOI": "10.1073/pnas.1115472109",
 "article-title": "Mapping patterns of long-term settlement in Northern Mesopotamia at a large scale",
 "volume": "109",
 "author": "Menze",
 "year": "2012",
 "journal-title": "Proc. Natl. Acad. Sci."
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib24",
 "doi-asserted-by": "crossref",
 "first-page": "80",
 "DOI": "10.1016/j.jas.2015.04.002",
 "article-title": "A supervised machine-learning approach towards geochemical predictive modelling in archaeology",
 "volume": "59",
 "author": "Oonk",
 "year": "2015",
 "journal-title": "J. Archaeol. Sci."
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib25",
 "doi-asserted-by": "crossref",
 "first-page": "49",
 "DOI": "10.1016/j.isprsjprs.2012.07.005",
 "article-title": "Combining terrestrial stereophotogrammetry, DGPS and GIS-based 3D voxel modelling in the volumetric recording of archaeological features",
 "volume": "76",
 "author": "Orengo",
 "year": "2013",
 "journal-title": "ISPRS J. Photogrammetry Remote Sens."
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib26",
 "doi-asserted-by": "crossref",
 "first-page": "100",
 "DOI": "10.1016/j.jas.2015.10.008",
 "article-title": "Photogrammetric re-discovery of the Eastern Thessalian hidden long-term landscapes",
 "volume": "64",
 "author": "Orengo",
 "year": "2015",
 "journal-title": "J. Archaeol. Sci."
 },
 {
 "issue": "3",
 "key": "10.1016/j.jas.2019.105013_bib27",
 "doi-asserted-by": "crossref",
 "first-page": "479",
 "DOI": "10.3764/aja.122.3.0479",
 "article-title": "Towards a definition of Minoan agro-pastoral landscapes: results of the survey at Palaikastro (Crete)",
 "volume": "122",
 "author": "Orengo",
 "year": "2018",
 "journal-title": "Am. J. Archaeol."
 },
 {
 "issue": "7",
 "key": "10.1016/j.jas.2019.105013_bib28",
 "doi-asserted-by": "crossref",
 "first-page": "735",
 "DOI": "10.3390/rs9070735",
 "article-title": "Large-scale, multi-temporal remote sensing of palaeo-river networks: a case study from Northwest India and its implications for the Indus civilisation",
 "volume": "9",
 "author": "Orengo",
 "year": "2017",
 "journal-title": "Remote Sens."
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib29",
 "doi-asserted-by": "crossref",
 "first-page": "1361",
 "DOI": "10.1002/esp.4317",
 "article-title": "Multi-scale relief model (MSRM): a new algorithm for the visualization of subtle topographic change of variable size in digital elevation models",
 "volume": "43",
 "author": "Orengo",
 "year": "2018",
 "journal-title": "Earth Surf. Process. Landforms"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib30",
 "series-title": "Submitted to Proceedings of the National Academy of Sciences",
 "article-title": "Living on the edge of the desert: automated detection of archaeological mounds in Cholistan (Pakistan) using machine learning classification of multi-sensor multi-temporal satellite data",
 "author": "Orengo",
 "year": "2019"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib31",
 "first-page": "154",
 "article-title": "How many trees in a random forest?",
 "volume": "vol. 7376",
 "author": "Oshiro",
 "year": "2012"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib32",
 "article-title": "Decision-making in modern surveys",
 "volume": "ume 1",
 "author": "Plog",
 "year": "1978"
 },
 {
 "issue": "4",
 "key": "10.1016/j.jas.2019.105013_bib33",
 "doi-asserted-by": "crossref",
 "first-page": "100",
 "DOI": "10.3390/geosciences7040100",
 "article-title": "From above and on the ground: geospatial methods for recording endangered archaeology in the Middle East and north africa",
 "volume": "7",
 "author": "Rayne",
 "year": "2017",
 "journal-title": "Geosciences"
 },
 {
 "issue": "1",
 "key": "10.1016/j.jas.2019.105013_bib34",
 "doi-asserted-by": "crossref",
 "first-page": "1",
 "DOI": "10.1080/00438243.1978.9979712",
 "article-title": "The design of archaeological surveys",
 "volume": "10",
 "author": "Schiffer",
 "year": "1978",
 "journal-title": "World Archaeol."
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib35",
 "series-title": "Experiments in the Collection and Analysis of Archaeological Survey Data: the East Hampshire Survey",
 "author": "Shennan",
 "year": "1985"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib36",
 "doi-asserted-by": "crossref",
 "first-page": "1066",
 "DOI": "10.1016/j.culher.2016.06.006",
 "article-title": "Drones over Mediterranean landscapes. The potential of small UAV's (drones) for site detection and heritage management in archaeological survey projects: a case study from Le Pianelle in the Tappino Valley, Molise (Italy)",
 "volume": "22",
 "author": "Stek",
 "year": "2016",
 "journal-title": "J. Cult. Herit."
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib37",
 "series-title": "Side-by-Side Survey. Comparative Regional Studies in the Mediterranean World",
 "first-page": "65",
 "article-title": "Side-by-side and back to front: exploring intra-regional latitudinal and longitudinal comparability in survey data. Three case studies from Metaponto, southern Italy",
 "author": "Thomson",
 "year": "2004"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib38",
 "series-title": "Digital Discovery. Exploring New Frontiers in Human Heritage. Computer Applications and Quantitative Methods in Archaeology",
 "article-title": "Computer vision and machine learning for archaeology",
 "author": "van der Maaten",
 "year": "2007"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib39",
 "doi-asserted-by": "crossref",
 "first-page": "1114",
 "DOI": "10.1111/j.1475-4754.2012.00667.x",
 "article-title": "Computer vision-based orthophoto mapping of complex archaeological sites: the ancient quarry of Pitaranha (Portugal-Spain)",
 "volume": "54",
 "author": "Verhoeven",
 "year": "2012",
 "journal-title": "Archaeometry"
 },
 {
 "key": "10.1016/j.jas.2019.105013_bib40",
 "series-title": "A Guide for Salvage Archeology",
 "author": "Wendorf",
 "year": "1962"
 }
 ],
 "container-title": [
 "Journal of Archaeological Science"
 ],
 "original-title": [
 ],
 "language": "en",
 "link": [
 {
 "URL": "https://api.elsevier.com/content/article/PII:S0305440319301001?httpAccept=text/xml",
 "content-type": "text/xml",
 "content-version": "vor",
 "intended-application": "text-mining"
 },
 {
 "URL": "https://api.elsevier.com/content/article/PII:S0305440319301001?httpAccept=text/plain",
 "content-type": "text/plain",
 "content-version": "vor",
 "intended-application": "text-mining"
 }
 ],
 "deposited": {
 "date-parts": [
 [
 2019,
 11,
 25
 ]
 ],
 "date-time": "2019-11-25T06:46:34Z",
 "timestamp": 1574664394000
 },
 "score": 1,
 "subtitle": [
 ],
 "short-title": [
 ],
 "issued": {
 "date-parts": [
 [
 2019,
 12
 ]
 ]
 },
 "references-count": 39,
 "alternative-id": [
 "S0305440319301001"
 ],
 "URL": "http://dx.doi.org/10.1016/j.jas.2019.105013",
 "relation": {
 },
 "ISSN": [
 "0305-4403"
 ],
 "issn-type": [
 {
 "value": "0305-4403",
 "type": "print"
 }
 ],
 "subject": [
 "Archaeology",
 "Archaeology"
 ],
 "published": {
 "date-parts": [
 [
 2019,
 12
 ]
 ]
 },
 "assertion": [
 {
 "value": "Elsevier",
 "name": "publisher",
 "label": "This article is maintained by"
 },
 {
 "value": "A brave new world for archaeological survey: Automated machine learning-based potsherd detection using high-resolution drone imagery",
 "name": "articletitle",
 "label": "Article Title"
 },
 {
 "value": "Journal of Archaeological Science",
 "name": "journaltitle",
 "label": "Journal Title"
 },
 {
 "value": "https://doi.org/10.1016/j.jas.2019.105013",
 "name": "articlelink",
 "label": "CrossRef DOI link to publisher maintained version"
 },
 {
 "value": "article",
 "name": "content_type",
 "label": "Content Type"
 },
 {
 "value": "© 2019 The Authors. Published by Elsevier Ltd.",
 "name": "copyright",
 "label": "Copyright"
 }
 ],
 "article-number": "105013"
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala
@ -0,0 +1,107 @@
 package eu.dnetlib.dhp.oa.graph.resolution
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import eu.dnetlib.dhp.common.HdfsSupport
 import eu.dnetlib.dhp.schema.common.EntityType
 import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
 import org.apache.commons.io.IOUtils
 import org.apache.hadoop.fs.{FileSystem, Path}
 import org.apache.spark.SparkConf
 import org.apache.spark.sql._
 import org.slf4j.{Logger, LoggerFactory}
 object SparkResolveEntities {
  val mapper = new ObjectMapper()
  val entities = List(EntityType.dataset,EntityType.publication, EntityType.software, EntityType.otherresearchproduct)
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()
    val graphBasePath = parser.get("graphBasePath")
    log.info(s"graphBasePath  -> $graphBasePath")
    val workingPath = parser.get("workingPath")
    log.info(s"workingPath  -> $workingPath")
    val unresolvedPath = parser.get("unresolvedPath")
    log.info(s"unresolvedPath  -> $unresolvedPath")
    val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
    fs.mkdirs(new Path(workingPath))
    resolveEntities(spark, workingPath, unresolvedPath)
    generateResolvedEntities(spark, workingPath, graphBasePath)
    // TO BE conservative we keep the original entities in the working dir
    // and save the resolved entities on the graphBasePath
    //In future these lines of code should be removed
    entities.foreach {
      e =>
        fs.rename(new Path(s"$graphBasePath/$e"), new Path(s"$workingPath/${e}_old"))
        fs.rename(new Path(s"$workingPath/resolvedGraph/$e"), new Path(s"$graphBasePath/$e"))
    }
 }
 def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = {
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    import spark.implicits._
    val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
    val up: Dataset[(String, Result)] = spark.read.text(unresolvedPath).as[String].map(s => mapper.readValue(s, classOf[Result])).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))
    rPid.joinWith(up, rPid("_2").equalTo(up("_1")), "inner").map {
      r =>
        val result = r._2._2
        val dnetId = r._1._1
        result.setId(dnetId)
        result
    }.write.mode(SaveMode.Overwrite).save(s"$workingPath/resolvedEntities")
  }
  def deserializeObject(input:String,  entity:EntityType ) :Result = {
   entity match {
     case EntityType.publication => mapper.readValue(input, classOf[Publication])
     case EntityType.dataset => mapper.readValue(input, classOf[OafDataset])
     case EntityType.software=> mapper.readValue(input, classOf[Software])
     case EntityType.otherresearchproduct=> mapper.readValue(input, classOf[OtherResearchProduct])
   }
  }
 def generateResolvedEntities(spark:SparkSession,  workingPath: String, graphBasePath:String) = {
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    import spark.implicits._
    val re:Dataset[Result] = spark.read.load(s"$workingPath/resolvedEntities").as[Result]
    entities.foreach {
      e =>
        spark.read.text(s"$graphBasePath/$e").as[String]
          .map(s => deserializeObject(s, e))
          .union(re)
          .groupByKey(_.getId)
          .reduceGroups {
            (x, y) =>
              x.mergeFrom(y)
              x
          }.map(_._2)
          .filter(r => r.getClass.getSimpleName.toLowerCase != "result")
          .map(r => mapper.writeValueAsString(r))(Encoders.STRING)
          .write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingPath/resolvedGraph/$e")
    }
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala
@ -96,6 +96,21 @@ object SparkResolveRelation {
      .text(s"$graphBasePath/relation")
  }
  def extractInstanceCF(input: String): List[(String, String)] = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
    lazy val json: json4s.JValue = parse(input)
    val result: List[(String, String)] = for {
      JObject(iObj) <- json \ "instance"
      JField("collectedfrom", JObject(cf)) <- iObj
      JField("instancetype", JObject(instancetype)) <- iObj
      JField("value", JString(collectedFrom)) <- cf
      JField("classname", JString(classname)) <- instancetype
    } yield (classname, collectedFrom)
    result
  }
  def extractPidsFromRecord(input: String): (String, List[(String, String)]) = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -108,14 +123,7 @@ object SparkResolveRelation {
      JField("classid", JString(pidType)) <- qualifier
    } yield (pidValue, pidType)
-    val alternateIds: List[(String, String)] = for {
+    (id, result)
      JObject(pids) <- json \\ "alternateIdentifier"
      JField("value", JString(pidValue)) <- pids
      JField("qualifier", JObject(qualifier)) <- pids
      JField("classid", JString(pidType)) <- qualifier
    } yield (pidValue, pidType)
    (id, result ::: alternateIds)
  }
@ -128,7 +136,7 @@ object SparkResolveRelation {
    source != null
  }
-  private def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = {
+  def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = {
    import spark.implicits._
    val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*")
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala
@ -59,7 +59,12 @@ object SparkConvertRDDtoDataset {
    log.info("Converting Relation")
-    val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation").map(s => mapper.readValue(s, classOf[Relation])).filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
+    val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin")
    val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation")
      .map(s => mapper.readValue(s, classOf[Relation]))
      .filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
      .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
    spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml
@ -4,6 +4,10 @@
            <name>graphBasePath</name>
            <description>the path of the graph</description>
        </property>
        <property>
            <name>unresolvedPath</name>
            <description>the path of the unresolved Entities</description>
        </property>
    </parameters>
    <start to="ResolveRelations"/>
@ -36,5 +40,33 @@
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <action name="ResolveEntities">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Resolve Relations in raw graph</name>
            <class>eu.dnetlib.dhp.oa.graph.resolution.SparkResolveEntities</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.shuffle.partitions=10000
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--master</arg><arg>yarn</arg>
            <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
            <arg>--unresolvedPath</arg><arg>${unresolvedPath}</arg>
            <arg>--workingPath</arg><arg>${workingDir}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>
    <end name="End"/>
 </workflow-app>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json
@ -0,0 +1,6 @@
 [
  {"paramName":"mt",  "paramLongName":"master",       "paramDescription": "should be local or yarn",    "paramRequired": true},
  {"paramName":"w",   "paramLongName":"workingPath",  "paramDescription": "the source Path",            "paramRequired": true},
  {"paramName":"u",   "paramLongName":"unresolvedPath",  "paramDescription": "the source Path",            "paramRequired": true},
  {"paramName":"g",   "paramLongName":"graphBasePath",   "paramDescription": "the path of the raw graph",  "paramRequired": true}
 ]
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala
@ -0,0 +1,190 @@
 package eu.dnetlib.dhp.oa.graph.resolution
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.schema.common.EntityType
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
 import eu.dnetlib.dhp.schema.oaf.{Result, StructuredProperty}
 import org.apache.commons.io.FileUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql._
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.TestInstance.Lifecycle
 import org.junit.jupiter.api.{AfterAll, BeforeAll, Test, TestInstance}
 import java.nio.file.{Files, Path}
 import scala.collection.JavaConverters._
 import scala.io.Source
@TestInstance(Lifecycle.PER_CLASS)
 class ResolveEntitiesTest extends Serializable {
  var workingDir:Path = null
  val FAKE_TITLE = "FAKETITLE"
  val FAKE_SUBJECT = "FAKESUBJECT"
  var sparkSession:Option[SparkSession] = None
  @BeforeAll
  def setUp() :Unit = {
    workingDir = Files.createTempDirectory(getClass.getSimpleName)
    val conf = new SparkConf()
    sparkSession = Some(SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
      .master("local[*]").getOrCreate())
    populateDatasets(sparkSession.get)
    generateUpdates(sparkSession.get)
  }
  @AfterAll
  def tearDown():Unit = {
    FileUtils.deleteDirectory(workingDir.toFile)
    sparkSession.get.stop()
  }
  def generateUpdates(spark:SparkSession):Unit = {
    val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString
    val pids:List[String] = template.lines.map{id =>
      val r = new Result
      r.setId(id.toLowerCase.trim)
      r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
      r.setTitle(List(OafMapperUtils.structuredProperty(FAKE_TITLE, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava)
      r
    }.map{r =>
      val mapper = new ObjectMapper()
      mapper.writeValueAsString(r)}.toList
    val sc =spark.sparkContext
    println(sc.parallelize(pids).count())
    spark.createDataset(sc.parallelize(pids))(Encoders.STRING).write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingDir/updates")
    import spark.implicits._
    implicit val resEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
    val ds = spark.read.text(s"$workingDir/updates").as[String].map{s => val mapper = new ObjectMapper()
     mapper.readValue(s, classOf[Result])}.collect()
    assertEquals(4, ds.length)
    ds.foreach{r => assertNotNull(r.getSubject)}
    ds.foreach{r => assertEquals(1,r.getSubject.size())}
    ds.foreach{r => assertNotNull(r.getTitle)}
    ds.foreach{r => assertEquals(1,r.getTitle.size())}
    ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_TITLE,t))
    ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_SUBJECT,t))
    println("generated Updates")
  }
  def populateDatasets(spark:SparkSession):Unit = {
    import spark.implicits._
    val entities =SparkResolveEntities.entities
    entities.foreach{
      e =>
        val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
        spark.createDataset(spark.sparkContext.parallelize(template.lines.toList)).as[String].write.option("compression", "gzip").text(s"$workingDir/graph/$e")
        println(s"Created Dataset $e")
    }
    SparkResolveRelation.extractPidResolvedTableFromJsonRDD(spark, s"$workingDir/graph", s"$workingDir/work")
  }
  @Test
  def testResolution():Unit = {
    val spark:SparkSession = sparkSession.get
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" )
    val ds = spark.read.load(s"$workingDir/work/resolvedEntities").as[Result]
    assertEquals(3, ds.count())
    ds.collect().foreach{
      r =>
      assertTrue(r.getId.startsWith("50"))
    }
  }
  private def structuredPContainsValue(l:java.util.List[StructuredProperty], exptectedValue:String):Boolean = {
    l.asScala.exists(p =>p.getValue!= null && p.getValue.equalsIgnoreCase(exptectedValue))
  }
  @Test
  def testUpdate():Unit = {
    val spark:SparkSession = sparkSession.get
    import spark.implicits._
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    val m = new ObjectMapper()
    SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" )
    SparkResolveEntities.generateResolvedEntities(spark,s"$workingDir/work",s"$workingDir/graph" )
    val pubDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/publication").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
    val t  = pubDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
    val datDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset))
    val td  = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
    val softDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/software").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.software))
    val ts  = softDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
    val orpDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/otherresearchproduct").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct))
    val to  = orpDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
    assertEquals(0, t)
    assertEquals(2, td)
    assertEquals(1, ts)
    assertEquals(0, to)
  }
 }
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/otherresearchproduct
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/otherresearchproduct
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/publication
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/publication
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/software
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/software
--- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/updates
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/updates
@ -0,0 +1,4 @@
 unresolved::10.17026/dans-x3z-fsq5::doi
 unresolved::10.17026/dans-xsw-qtnx::doi
 unresolved::10.5281/zenodo.1473694::doi
 unresolved::10.17632/fake::doi
--- a/pom.xml
+++ b/pom.xml
@ -753,7 +753,7 @@
 		<mockito-core.version>3.3.3</mockito-core.version>
 		<mongodb.driver.version>3.4.2</mongodb.driver.version>
 		<vtd.version>[2.12,3.0)</vtd.version>
-		<dhp-schemas.version>[2.8.22-SNAPSHOT]</dhp-schemas.version>
+		<dhp-schemas.version>[2.8.22]</dhp-schemas.version>
 		<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
 		<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
 		<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>