From 2480e590d1c5b440c336febc51fa3f53270e3dbf Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 3 Nov 2021 14:25:23 +0100 Subject: [PATCH 01/10] [DOIBoost - Mapping] changed the type on which to map dissertation from Crossref: from 006 Doctoral thesis to 0044 Thesis since dissertation could be either Doctoral or master thesis --- .../main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 25f0ff381..dc9e18fde 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -70,7 +70,7 @@ case object Crossref2Oaf { "reference-book" -> "0002 Book", "monograph" -> "0002 Book", "journal-article" -> "0001 Article", - "dissertation" -> "0006 Doctoral thesis", + "dissertation" -> "0044 Thesis", "other" -> "0038 Other literature type", "peer-review" -> "0015 Review", "proceedings" -> "0004 Conference object", From 779318961cd4e7a1b24990229caf74a1775230ac Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 3 Nov 2021 14:38:52 +0100 Subject: [PATCH 02/10] [DOIBoost - Mapping] removed the url from crossref containing the api.elsevier.com... 
string in the url --- .../main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index dc9e18fde..53a9e8bd4 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -207,7 +207,7 @@ case object Crossref2Oaf { instance.setDateofacceptance(asField(createdDate.getValue)) } val s: String = (json \ "URL").extract[String] - val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null).distinct + val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && !p.contains("api.elsevier.com...")).distinct if (links.nonEmpty) { instance.setUrl(links.asJava) } From b2bb8d9d7908508f7d7abaf4d0d5d246fb38a42d Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 3 Nov 2021 15:44:57 +0100 Subject: [PATCH 03/10] [DOIBoost - Mapping] selecting the url from Crossref containing the doi --- .../main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 53a9e8bd4..51637f5bb 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -207,7 +207,7 @@ case object Crossref2Oaf { instance.setDateofacceptance(asField(createdDate.getValue)) } val s: String = (json \ "URL").extract[String] - val links: 
List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && !p.contains("api.elsevier.com...")).distinct + val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && p.toLowerCase().contains(doi.toLowerCase())).distinct if (links.nonEmpty) { instance.setUrl(links.asJava) } From 683fe093cff2a76fb42d27c52c5c61831f48c268 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 3 Nov 2021 15:51:26 +0100 Subject: [PATCH 04/10] [DOIBoost - Mapping] Remove the addition of the instance to the MAG publication record --- .../doiboost/mag/SparkProcessMAG.scala | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index ecb389af8..c011cbd20 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -111,22 +111,24 @@ object SparkProcessMAG { .map(item => ConversionUtil.updatePubsWithConferenceInfo(item)) .write .mode(SaveMode.Overwrite) - .save(s"$workingPath/merge_step_2_conference") - - - magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication] - .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] - - val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl] - - - logger.info("Phase 5) enrich publication with URL and Instances") - magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left") - .map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) } - 
.write.mode(SaveMode.Overwrite) .save(s"$workingPath/merge_step_3") + //no more needed to add the instance to mag records +// magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication] +// .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] +// +// val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl] +// +// +// +// logger.info("Phase 5) enrich publication with URL and Instances") +// magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left") +// .map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) } +// .write.mode(SaveMode.Overwrite) +// .save(s"$workingPath/merge_step_3") + + // logger.info("Phase 6) Enrich Publication with description") // val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract] // pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract") From 96769b44816bbebbeee2bf47c03e9079f83ea441 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 3 Nov 2021 16:43:36 +0100 Subject: [PATCH 05/10] [DOIBoost - Mapping] Changed the logic which brought into the instance urls that should not be there: the url of the doi in the json is reachable from the root (json/"URL"), other urls were added from the links element.
Now the mapping from the link element has been removed --- .../dnetlib/doiboost/crossref/Crossref2Oaf.scala | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 51637f5bb..4c06d283a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -206,11 +206,16 @@ case object Crossref2Oaf { else { instance.setDateofacceptance(asField(createdDate.getValue)) } - val s: String = (json \ "URL").extract[String] - val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && p.toLowerCase().contains(doi.toLowerCase())).distinct - if (links.nonEmpty) { - instance.setUrl(links.asJava) - } + val s: List[String] = List((json \ "URL").extract[String]) +// val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && p.toLowerCase().contains(doi.toLowerCase())).distinct +// if (links.nonEmpty) { +// instance.setUrl(links.asJava) +// } + if(s.nonEmpty) + { + instance.setUrl(s.asJava) + } + result.setInstance(List(instance).asJava) //IMPORTANT From d97ea82a2922846a2e03df84992df10cb479287c Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 3 Nov 2021 16:45:15 +0100 Subject: [PATCH 06/10] [DOIBoost Mapping] Added test to verify the instance created for Crossref will have just the url related to the doi --- .../crossref/CrossrefMappingTest.scala | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala index 
75fb3f787..f6d5e124e 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala @@ -612,4 +612,26 @@ class CrossrefMappingTest { } + @Test + def testMultipleURLs() :Unit = { + val json = Source.fromInputStream(getClass.getResourceAsStream("multiple_urls.json")).mkString + + + assertNotNull(json) + assertFalse(json.isEmpty); + + val resultList: List[Oaf] = Crossref2Oaf.convert(json) + + assertTrue(resultList.nonEmpty) + + + val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] + + assertEquals(1, item.getInstance().size()) + assertEquals(1, item.getInstance().get(0).getUrl().size()) + assertEquals("http://dx.doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0)) + //println(mapper.writeValueAsString(item)) + + } + } From edf55395e97d92d9b8eb7c29184f0664b0ed7345 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 3 Nov 2021 16:49:30 +0100 Subject: [PATCH 07/10] added test resource --- .../doiboost/crossref/multiple_urls.json | 614 ++++++++++++++++++ 1 file changed, 614 insertions(+) create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/multiple_urls.json diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/multiple_urls.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/multiple_urls.json new file mode 100644 index 000000000..5f90feac4 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/multiple_urls.json @@ -0,0 +1,614 @@ + +{ +"indexed": { +"date-parts": [ +[ +2021, +10, +31 +] +], +"date-time": "2021-10-31T15:48:01Z", +"timestamp": 1635695281393 +}, +"reference-count": 39, +"publisher": "Elsevier BV", +"license": [ +{ +"start": { +"date-parts": [ +[ +2019, +12, +1 +] +], +"date-time":
"2019-12-01T00:00:00Z", +"timestamp": 1575158400000 +}, +"content-version": "tdm", +"delay-in-days": 0, +"URL": "https://www.elsevier.com/tdm/userlicense/1.0/" +}, +{ +"start": { +"date-parts": [ +[ +2019, +9, +13 +] +], +"date-time": "2019-09-13T00:00:00Z", +"timestamp": 1568332800000 +}, +"content-version": "vor", +"delay-in-days": 0, +"URL": "http://creativecommons.org/licenses/by/4.0/" +} +], +"funder": [ +{ +"DOI": "10.13039/100001182", +"name": "INSTAP", +"doi-asserted-by": "publisher" +}, +{ +"DOI": "10.13039/100014440", +"name": "Ministry of Science, Innovation and Universities", +"doi-asserted-by": "publisher", +"award": [ +"RYC-2016-19637" +] +}, +{ +"DOI": "10.13039/100010661", +"name": "European Union’s Horizon 2020", +"doi-asserted-by": "publisher", +"award": [ +"746446" +] +} +], +"content-domain": { +"domain": [ +"elsevier.com", +"sciencedirect.com" +], +"crossmark-restriction": true +}, +"short-container-title": [ +"Journal of Archaeological Science" +], +"published-print": { +"date-parts": [ +[ +2019, +12 +] +] +}, +"DOI": "10.1016/j.jas.2019.105013", +"type": "journal-article", +"created": { +"date-parts": [ +[ +2019, +9, +25 +] +], +"date-time": "2019-09-25T20:05:08Z", +"timestamp": 1569441908000 +}, +"page": "105013", +"update-policy": "http://dx.doi.org/10.1016/elsevier_cm_policy", +"source": "Crossref", +"is-referenced-by-count": 21, +"title": [ +"A brave new world for archaeological survey: Automated machine learning-based potsherd detection using high-resolution drone imagery" +], +"prefix": "10.1016", +"volume": "112", +"author": [ +{ +"given": "H.A.", +"family": "Orengo", +"sequence": "first", +"affiliation": [ + +] +}, +{ +"given": "A.", +"family": "Garcia-Molsosa", +"sequence": "additional", +"affiliation": [ + +] +} +], +"member": "78", +"reference": [ +{ +"key": "10.1016/j.jas.2019.105013_bib1", +"doi-asserted-by": "crossref", +"first-page": "85", +"DOI": "10.1080/17538947.2016.1250829", +"article-title": "Remote sensing heritage in a 
petabyte-scale: satellite data and heritage Earth Engine© applications", +"volume": "10", +"author": "Agapiou", +"year": "2017", +"journal-title": "Int. J. Digit. Earth" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib2", +"series-title": "Extracting Meaning from Ploughsoil Assemblages", +"first-page": "1", +"article-title": "Extracting meaning from ploughsoil assemblages: assessments of the past, strategies for the future", +"author": "Alcock", +"year": "2000" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib3", +"series-title": "Side-by-Side Survey. Comparative Regional Studies in the Mediterranean World", +"first-page": "1", +"article-title": "Introduction", +"author": "Alcock", +"year": "2004" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib4", +"doi-asserted-by": "crossref", +"first-page": "93", +"DOI": "10.1111/j.1538-4632.1995.tb00338.x", +"article-title": "Local indicators of spatial association—LISA", +"volume": "27", +"author": "Anselin", +"year": "1995", +"journal-title": "Geogr. Anal." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib5", +"series-title": "Archaeological Survey", +"author": "Banning", +"year": "2002" +}, +{ +"issue": "1/2", +"key": "10.1016/j.jas.2019.105013_bib6", +"doi-asserted-by": "crossref", +"first-page": "123", +"DOI": "10.2307/3181488", +"article-title": "GIS, archaeological survey and landscape archaeology on the island of Kythera, Greece", +"volume": "29", +"author": "Bevan", +"year": "2004", +"journal-title": "J. Field Archaeol." +}, +{ +"issue": "1", +"key": "10.1016/j.jas.2019.105013_bib8", +"doi-asserted-by": "crossref", +"first-page": "5", +"DOI": "10.1023/A:1010933404324", +"article-title": "Random forests", +"volume": "45", +"author": "Breiman", +"year": "2001", +"journal-title": "Mach. Learn." 
+}, +{ +"key": "10.1016/j.jas.2019.105013_bib9", +"series-title": "Sampling in Contemporary British Archaeology", +"author": "Cherry", +"year": "1978" +}, +{ +"issue": "3", +"key": "10.1016/j.jas.2019.105013_bib10", +"doi-asserted-by": "crossref", +"first-page": "273", +"DOI": "10.1016/0734-189X(84)90197-X", +"article-title": "Segmentation of a high-resolution urban scene using texture operators", +"volume": "25", +"author": "Conners", +"year": "1984", +"journal-title": "Comput. Vis. Graph Image Process" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib11", +"first-page": "31", +"article-title": "Old land surfaces and modern ploughsoil: implications of recent work at Maxey, Cambridgeshire", +"volume": "2", +"author": "Crowther", +"year": "1983", +"journal-title": "Scott. Archaeol. Rev." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib12", +"series-title": "Settlement Pattern Studies in the Americas: Fifty Years since Virú", +"first-page": "203", +"article-title": "Conclusions: the settlement pattern concept from an Americanist perspective", +"author": "Fish", +"year": "1999" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib13", +"doi-asserted-by": "crossref", +"first-page": "21", +"DOI": "10.3390/geosciences9010021", +"article-title": "Remote sensing and historical morphodynamics of alluvial plains. The 1909 indus flood and the city of Dera Gazhi Khan (province of Punjab, Pakistan)", +"volume": "9", +"author": "Garcia", +"year": "2019", +"journal-title": "Geosciences" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib14", +"unstructured": "Georgiadis, M.; Garcia-Molsosa, A.; Orengo, H.A.; Kefalidou, E. and Kallintzi, K. In Preparation. APAX Project 2015-2018: A Preliminary Report. (Hesperia)." 
+}, +{ +"key": "10.1016/j.jas.2019.105013_bib15", +"series-title": "Geographical Information Systems and Landscape Archaeology", +"first-page": "35", +"article-title": "Regional survey and GIS: the boeotia project", +"author": "Gillings", +"year": "1999" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib16", +"doi-asserted-by": "crossref", +"first-page": "18", +"DOI": "10.1016/j.rse.2017.06.031", +"article-title": "Google Earth engine: planetary-scale geospatial analysis for everyone", +"volume": "202", +"author": "Gorelick", +"year": "2017", +"journal-title": "Remote Sens. Environ." +}, +{ +"issue": "107", +"key": "10.1016/j.jas.2019.105013_bib17", +"doi-asserted-by": "crossref", +"first-page": "177", +"DOI": "10.1111/j.0031-868X.2004.00278.x", +"article-title": "Photogrammetric reconstruction of the great buddha of Bamiyan, Afghanistan", +"volume": "19", +"author": "Grün", +"year": "2004", +"journal-title": "Photogramm. Rec." +}, +{ +"issue": "6", +"key": "10.1016/j.jas.2019.105013_bib18", +"doi-asserted-by": "crossref", +"first-page": "610", +"DOI": "10.1109/TSMC.1973.4309314", +"article-title": "Textural features for image classification", +"author": "Haralick", +"year": "1973", +"journal-title": "IEEE Trans. Syst., Man, Cybernet., SMC-3" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib19", +"doi-asserted-by": "crossref", +"first-page": "76", +"DOI": "10.1558/jmea.v14i1.76", +"article-title": "Excavating to excess? Implications of the last decade of archaeology in Israel", +"volume": "14", +"author": "Kletter", +"year": "2001", +"journal-title": "J. Mediterr. Archaeol." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib20", +"first-page": "299", +"article-title": "Testing Google Earth Engine for the automatic identification and vectorization of archaeological features: a case study from Faynan, Jordan", +"volume": "15", +"author": "Liss", +"year": "2017", +"journal-title": "J. Archaeol. 
Sci.: Report" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib21", +"series-title": "Geographical Information Systems and Landscape Archaeology", +"first-page": "55", +"article-title": "Towards a methodology for modelling surface survey data: the sangro valley project", +"author": "Lock", +"year": "1999" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib22", +"series-title": "Extracting Meaning from Ploughsoil Assemblages", +"first-page": "5", +"article-title": "Methods of collection recording and quantification", +"author": "Mattingly", +"year": "2000" +}, +{ +"issue": "14", +"key": "10.1016/j.jas.2019.105013_bib23", +"doi-asserted-by": "crossref", +"first-page": "E778", +"DOI": "10.1073/pnas.1115472109", +"article-title": "Mapping patterns of long-term settlement in Northern Mesopotamia at a large scale", +"volume": "109", +"author": "Menze", +"year": "2012", +"journal-title": "Proc. Natl. Acad. Sci." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib24", +"doi-asserted-by": "crossref", +"first-page": "80", +"DOI": "10.1016/j.jas.2015.04.002", +"article-title": "A supervised machine-learning approach towards geochemical predictive modelling in archaeology", +"volume": "59", +"author": "Oonk", +"year": "2015", +"journal-title": "J. Archaeol. Sci." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib25", +"doi-asserted-by": "crossref", +"first-page": "49", +"DOI": "10.1016/j.isprsjprs.2012.07.005", +"article-title": "Combining terrestrial stereophotogrammetry, DGPS and GIS-based 3D voxel modelling in the volumetric recording of archaeological features", +"volume": "76", +"author": "Orengo", +"year": "2013", +"journal-title": "ISPRS J. Photogrammetry Remote Sens." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib26", +"doi-asserted-by": "crossref", +"first-page": "100", +"DOI": "10.1016/j.jas.2015.10.008", +"article-title": "Photogrammetric re-discovery of the Eastern Thessalian hidden long-term landscapes", +"volume": "64", +"author": "Orengo", +"year": "2015", +"journal-title": "J. 
Archaeol. Sci." +}, +{ +"issue": "3", +"key": "10.1016/j.jas.2019.105013_bib27", +"doi-asserted-by": "crossref", +"first-page": "479", +"DOI": "10.3764/aja.122.3.0479", +"article-title": "Towards a definition of Minoan agro-pastoral landscapes: results of the survey at Palaikastro (Crete)", +"volume": "122", +"author": "Orengo", +"year": "2018", +"journal-title": "Am. J. Archaeol." +}, +{ +"issue": "7", +"key": "10.1016/j.jas.2019.105013_bib28", +"doi-asserted-by": "crossref", +"first-page": "735", +"DOI": "10.3390/rs9070735", +"article-title": "Large-scale, multi-temporal remote sensing of palaeo-river networks: a case study from Northwest India and its implications for the Indus civilisation", +"volume": "9", +"author": "Orengo", +"year": "2017", +"journal-title": "Remote Sens." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib29", +"doi-asserted-by": "crossref", +"first-page": "1361", +"DOI": "10.1002/esp.4317", +"article-title": "Multi-scale relief model (MSRM): a new algorithm for the visualization of subtle topographic change of variable size in digital elevation models", +"volume": "43", +"author": "Orengo", +"year": "2018", +"journal-title": "Earth Surf. Process. Landforms" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib30", +"series-title": "Submitted to Proceedings of the National Academy of Sciences", +"article-title": "Living on the edge of the desert: automated detection of archaeological mounds in Cholistan (Pakistan) using machine learning classification of multi-sensor multi-temporal satellite data", +"author": "Orengo", +"year": "2019" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib31", +"first-page": "154", +"article-title": "How many trees in a random forest?", +"volume": "vol. 
7376", +"author": "Oshiro", +"year": "2012" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib32", +"article-title": "Decision-making in modern surveys", +"volume": "ume 1", +"author": "Plog", +"year": "1978" +}, +{ +"issue": "4", +"key": "10.1016/j.jas.2019.105013_bib33", +"doi-asserted-by": "crossref", +"first-page": "100", +"DOI": "10.3390/geosciences7040100", +"article-title": "From above and on the ground: geospatial methods for recording endangered archaeology in the Middle East and north africa", +"volume": "7", +"author": "Rayne", +"year": "2017", +"journal-title": "Geosciences" +}, +{ +"issue": "1", +"key": "10.1016/j.jas.2019.105013_bib34", +"doi-asserted-by": "crossref", +"first-page": "1", +"DOI": "10.1080/00438243.1978.9979712", +"article-title": "The design of archaeological surveys", +"volume": "10", +"author": "Schiffer", +"year": "1978", +"journal-title": "World Archaeol." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib35", +"series-title": "Experiments in the Collection and Analysis of Archaeological Survey Data: the East Hampshire Survey", +"author": "Shennan", +"year": "1985" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib36", +"doi-asserted-by": "crossref", +"first-page": "1066", +"DOI": "10.1016/j.culher.2016.06.006", +"article-title": "Drones over Mediterranean landscapes. The potential of small UAV's (drones) for site detection and heritage management in archaeological survey projects: a case study from Le Pianelle in the Tappino Valley, Molise (Italy)", +"volume": "22", +"author": "Stek", +"year": "2016", +"journal-title": "J. Cult. Herit." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib37", +"series-title": "Side-by-Side Survey. Comparative Regional Studies in the Mediterranean World", +"first-page": "65", +"article-title": "Side-by-side and back to front: exploring intra-regional latitudinal and longitudinal comparability in survey data. 
Three case studies from Metaponto, southern Italy", +"author": "Thomson", +"year": "2004" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib38", +"series-title": "Digital Discovery. Exploring New Frontiers in Human Heritage. Computer Applications and Quantitative Methods in Archaeology", +"article-title": "Computer vision and machine learning for archaeology", +"author": "van der Maaten", +"year": "2007" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib39", +"doi-asserted-by": "crossref", +"first-page": "1114", +"DOI": "10.1111/j.1475-4754.2012.00667.x", +"article-title": "Computer vision-based orthophoto mapping of complex archaeological sites: the ancient quarry of Pitaranha (Portugal-Spain)", +"volume": "54", +"author": "Verhoeven", +"year": "2012", +"journal-title": "Archaeometry" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib40", +"series-title": "A Guide for Salvage Archeology", +"author": "Wendorf", +"year": "1962" +} +], +"container-title": [ +"Journal of Archaeological Science" +], +"original-title": [ + +], +"language": "en", +"link": [ +{ +"URL": "https://api.elsevier.com/content/article/PII:S0305440319301001?httpAccept=text/xml", +"content-type": "text/xml", +"content-version": "vor", +"intended-application": "text-mining" +}, +{ +"URL": "https://api.elsevier.com/content/article/PII:S0305440319301001?httpAccept=text/plain", +"content-type": "text/plain", +"content-version": "vor", +"intended-application": "text-mining" +} +], +"deposited": { +"date-parts": [ +[ +2019, +11, +25 +] +], +"date-time": "2019-11-25T06:46:34Z", +"timestamp": 1574664394000 +}, +"score": 1, +"subtitle": [ + +], +"short-title": [ + +], +"issued": { +"date-parts": [ +[ +2019, +12 +] +] +}, +"references-count": 39, +"alternative-id": [ +"S0305440319301001" +], +"URL": "http://dx.doi.org/10.1016/j.jas.2019.105013", +"relation": { + +}, +"ISSN": [ +"0305-4403" +], +"issn-type": [ +{ +"value": "0305-4403", +"type": "print" +} +], +"subject": [ +"Archaeology", +"Archaeology" +], +"published": { 
+"date-parts": [ +[ +2019, +12 +] +] +}, +"assertion": [ +{ +"value": "Elsevier", +"name": "publisher", +"label": "This article is maintained by" +}, +{ +"value": "A brave new world for archaeological survey: Automated machine learning-based potsherd detection using high-resolution drone imagery", +"name": "articletitle", +"label": "Article Title" +}, +{ +"value": "Journal of Archaeological Science", +"name": "journaltitle", +"label": "Journal Title" +}, +{ +"value": "https://doi.org/10.1016/j.jas.2019.105013", +"name": "articlelink", +"label": "CrossRef DOI link to publisher maintained version" +}, +{ +"value": "article", +"name": "content_type", +"label": "Content Type" +}, +{ +"value": "© 2019 The Authors. Published by Elsevier Ltd.", +"name": "copyright", +"label": "Copyright" +} +], +"article-number": "105013" +} From d50057b2d92ab99407461d37490bff19b6608346 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 3 Nov 2021 16:59:37 +0100 Subject: [PATCH 08/10] [DOIBoost Mapping] changed the way to create the url for the instance: we use the Crossref guidelines https://doi.org/doi --- .../main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 2 +- .../java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 4c06d283a..1b1c850ba 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -206,7 +206,7 @@ case object Crossref2Oaf { else { instance.setDateofacceptance(asField(createdDate.getValue)) } - val s: List[String] = List((json \ "URL").extract[String]) + val s: List[String] = List("https://doi.org/" + doi) // val links: List[String] = ((for {JString(url) <-
json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && p.toLowerCase().contains(doi.toLowerCase())).distinct // if (links.nonEmpty) { // instance.setUrl(links.asJava) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala index f6d5e124e..5ef92cfa4 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala @@ -629,7 +629,7 @@ class CrossrefMappingTest { assertEquals(1, item.getInstance().size()) assertEquals(1, item.getInstance().get(0).getUrl().size()) - assertEquals("http://dx.doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0)) + assertEquals("https://doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0)) //println(mapper.writeValueAsString(item)) } From de63d29b6f5b6cdc0a45df94f5ade5f9c4c28711 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 4 Nov 2021 16:16:40 +0100 Subject: [PATCH 09/10] [DOIBoost Mapping] Fix to avoid to produce results with null as identifier (probably due to the filtering function in the factory for the creation of the id) --- .../dnetlib/doiboost/mag/SparkProcessMAG.scala | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index c011cbd20..fa3be973d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -164,12 +164,18 @@ object SparkProcessMAG { .write.mode(SaveMode.Overwrite) .save(s"$workingPath/mag_publication") + 
spark.read.load(s"$workingPath/mag_publication").as[Publication] + .filter(p => p.getId != null) // keep only publications that have an identifier; null ids must not reach groupByKey + .groupByKey(p => p.getId) + .reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) + .map(_._2) + .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") - val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication] - .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) - .map(_._2) - - spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") +// val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication] +// .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) +// .map(_._2) +// +// spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") } } From df7ee77c7afd7af06f90ebaac447decc665f2428 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 4 Nov 2021 16:24:07 +0100 Subject: [PATCH 10/10] [DOIBoost Mapping] removed not needed comments --- .../doiboost/mag/SparkProcessMAG.scala | 25 +------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index fa3be973d..016279787 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -114,25 +114,6 @@ object SparkProcessMAG { .save(s"$workingPath/merge_step_3") - //no more needed to add the instance to mag records -// magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication] -// .map(p =>
(ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] -// -// val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl] -// -// -// -// logger.info("Phase 5) enrich publication with URL and Instances") -// magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left") -// .map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) } -// .write.mode(SaveMode.Overwrite) -// .save(s"$workingPath/merge_step_3") - - -// logger.info("Phase 6) Enrich Publication with description") -// val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract] -// pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract") - val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract] @@ -171,11 +152,7 @@ object SparkProcessMAG { .map(_._2) .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") -// val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication] -// .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) -// .map(_._2) -// -// spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") + } }