1
0
Fork 0

merge with beta - resolved conflict in pom

This commit is contained in:
Miriam Baglioni 2021-11-12 10:19:59 +01:00
commit ffb0ce1d59
16 changed files with 1025 additions and 38 deletions

View File

@ -70,7 +70,7 @@ case object Crossref2Oaf {
"reference-book" -> "0002 Book", "reference-book" -> "0002 Book",
"monograph" -> "0002 Book", "monograph" -> "0002 Book",
"journal-article" -> "0001 Article", "journal-article" -> "0001 Article",
"dissertation" -> "0006 Doctoral thesis", "dissertation" -> "0044 Thesis",
"other" -> "0038 Other literature type", "other" -> "0038 Other literature type",
"peer-review" -> "0015 Review", "peer-review" -> "0015 Review",
"proceedings" -> "0004 Conference object", "proceedings" -> "0004 Conference object",
@ -206,11 +206,16 @@ case object Crossref2Oaf {
else { else {
instance.setDateofacceptance(asField(createdDate.getValue)) instance.setDateofacceptance(asField(createdDate.getValue))
} }
val s: String = (json \ "URL").extract[String] val s: List[String] = List("https://doi.org/" + doi)
val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null).distinct // val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && p.toLowerCase().contains(doi.toLowerCase())).distinct
if (links.nonEmpty) { // if (links.nonEmpty) {
instance.setUrl(links.asJava) // instance.setUrl(links.asJava)
// }
if(s.nonEmpty)
{
instance.setUrl(s.asJava)
} }
result.setInstance(List(instance).asJava) result.setInstance(List(instance).asJava)
//IMPORTANT //IMPORTANT

View File

@ -111,26 +111,9 @@ object SparkProcessMAG {
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item)) .map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
.write .write
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.save(s"$workingPath/merge_step_2_conference")
magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
logger.info("Phase 5) enrich publication with URL and Instances")
magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
.map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
.write.mode(SaveMode.Overwrite)
.save(s"$workingPath/merge_step_3") .save(s"$workingPath/merge_step_3")
// logger.info("Phase 6) Enrich Publication with description")
// val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
// pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract] val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]
@ -162,12 +145,14 @@ object SparkProcessMAG {
.write.mode(SaveMode.Overwrite) .write.mode(SaveMode.Overwrite)
.save(s"$workingPath/mag_publication") .save(s"$workingPath/mag_publication")
spark.read.load(s"$workingPath/mag_publication").as[Publication]
val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication] .filter(p => p.getId == null)
.map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) .groupByKey(p => p.getId)
.reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
.map(_._2) .map(_._2)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
} }
} }

View File

@ -612,4 +612,26 @@ class CrossrefMappingTest {
} }
/** Checks the URL mapping of a Crossref record whose JSON lists several URLs.
  *
  * The converted record is expected to expose exactly one instance carrying exactly one URL,
  * namely the `https://doi.org/` form of the record DOI.
  */
@Test
def testMultipleURLs(): Unit = {
  // Read the fixture eagerly and release the underlying stream as soon as it is consumed.
  val source = Source.fromInputStream(getClass.getResourceAsStream("multiple_urls.json"))
  val json =
    try source.mkString
    finally source.close()

  assertNotNull(json)
  assertFalse(json.isEmpty)

  val resultList: List[Oaf] = Crossref2Oaf.convert(json)
  assertTrue(resultList.nonEmpty)

  // Select the Result records by type instead of testing and casting by hand.
  val results: List[Result] = resultList.collect { case r: Result => r }
  assertTrue(results.nonEmpty)
  val item: Result = results.head

  assertEquals(1, item.getInstance().size())
  assertEquals(1, item.getInstance().get(0).getUrl().size())
  assertEquals("https://doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0))
}
} }

View File

@ -0,0 +1,614 @@
{
"indexed": {
"date-parts": [
[
2021,
10,
31
]
],
"date-time": "2021-10-31T15:48:01Z",
"timestamp": 1635695281393
},
"reference-count": 39,
"publisher": "Elsevier BV",
"license": [
{
"start": {
"date-parts": [
[
2019,
12,
1
]
],
"date-time": "2019-12-01T00:00:00Z",
"timestamp": 1575158400000
},
"content-version": "tdm",
"delay-in-days": 0,
"URL": "https://www.elsevier.com/tdm/userlicense/1.0/"
},
{
"start": {
"date-parts": [
[
2019,
9,
13
]
],
"date-time": "2019-09-13T00:00:00Z",
"timestamp": 1568332800000
},
"content-version": "vor",
"delay-in-days": 0,
"URL": "http://creativecommons.org/licenses/by/4.0/"
}
],
"funder": [
{
"DOI": "10.13039/100001182",
"name": "INSTAP",
"doi-asserted-by": "publisher"
},
{
"DOI": "10.13039/100014440",
"name": "Ministry of Science, Innovation and Universities",
"doi-asserted-by": "publisher",
"award": [
"RYC-2016-19637"
]
},
{
"DOI": "10.13039/100010661",
"name": "European Unions Horizon 2020",
"doi-asserted-by": "publisher",
"award": [
"746446"
]
}
],
"content-domain": {
"domain": [
"elsevier.com",
"sciencedirect.com"
],
"crossmark-restriction": true
},
"short-container-title": [
"Journal of Archaeological Science"
],
"published-print": {
"date-parts": [
[
2019,
12
]
]
},
"DOI": "10.1016/j.jas.2019.105013",
"type": "journal-article",
"created": {
"date-parts": [
[
2019,
9,
25
]
],
"date-time": "2019-09-25T20:05:08Z",
"timestamp": 1569441908000
},
"page": "105013",
"update-policy": "http://dx.doi.org/10.1016/elsevier_cm_policy",
"source": "Crossref",
"is-referenced-by-count": 21,
"title": [
"A brave new world for archaeological survey: Automated machine learning-based potsherd detection using high-resolution drone imagery"
],
"prefix": "10.1016",
"volume": "112",
"author": [
{
"given": "H.A.",
"family": "Orengo",
"sequence": "first",
"affiliation": [
]
},
{
"given": "A.",
"family": "Garcia-Molsosa",
"sequence": "additional",
"affiliation": [
]
}
],
"member": "78",
"reference": [
{
"key": "10.1016/j.jas.2019.105013_bib1",
"doi-asserted-by": "crossref",
"first-page": "85",
"DOI": "10.1080/17538947.2016.1250829",
"article-title": "Remote sensing heritage in a petabyte-scale: satellite data and heritage Earth Engine© applications",
"volume": "10",
"author": "Agapiou",
"year": "2017",
"journal-title": "Int. J. Digit. Earth"
},
{
"key": "10.1016/j.jas.2019.105013_bib2",
"series-title": "Extracting Meaning from Ploughsoil Assemblages",
"first-page": "1",
"article-title": "Extracting meaning from ploughsoil assemblages: assessments of the past, strategies for the future",
"author": "Alcock",
"year": "2000"
},
{
"key": "10.1016/j.jas.2019.105013_bib3",
"series-title": "Side-by-Side Survey. Comparative Regional Studies in the Mediterranean World",
"first-page": "1",
"article-title": "Introduction",
"author": "Alcock",
"year": "2004"
},
{
"key": "10.1016/j.jas.2019.105013_bib4",
"doi-asserted-by": "crossref",
"first-page": "93",
"DOI": "10.1111/j.1538-4632.1995.tb00338.x",
"article-title": "Local indicators of spatial association—LISA",
"volume": "27",
"author": "Anselin",
"year": "1995",
"journal-title": "Geogr. Anal."
},
{
"key": "10.1016/j.jas.2019.105013_bib5",
"series-title": "Archaeological Survey",
"author": "Banning",
"year": "2002"
},
{
"issue": "1/2",
"key": "10.1016/j.jas.2019.105013_bib6",
"doi-asserted-by": "crossref",
"first-page": "123",
"DOI": "10.2307/3181488",
"article-title": "GIS, archaeological survey and landscape archaeology on the island of Kythera, Greece",
"volume": "29",
"author": "Bevan",
"year": "2004",
"journal-title": "J. Field Archaeol."
},
{
"issue": "1",
"key": "10.1016/j.jas.2019.105013_bib8",
"doi-asserted-by": "crossref",
"first-page": "5",
"DOI": "10.1023/A:1010933404324",
"article-title": "Random forests",
"volume": "45",
"author": "Breiman",
"year": "2001",
"journal-title": "Mach. Learn."
},
{
"key": "10.1016/j.jas.2019.105013_bib9",
"series-title": "Sampling in Contemporary British Archaeology",
"author": "Cherry",
"year": "1978"
},
{
"issue": "3",
"key": "10.1016/j.jas.2019.105013_bib10",
"doi-asserted-by": "crossref",
"first-page": "273",
"DOI": "10.1016/0734-189X(84)90197-X",
"article-title": "Segmentation of a high-resolution urban scene using texture operators",
"volume": "25",
"author": "Conners",
"year": "1984",
"journal-title": "Comput. Vis. Graph Image Process"
},
{
"key": "10.1016/j.jas.2019.105013_bib11",
"first-page": "31",
"article-title": "Old land surfaces and modern ploughsoil: implications of recent work at Maxey, Cambridgeshire",
"volume": "2",
"author": "Crowther",
"year": "1983",
"journal-title": "Scott. Archaeol. Rev."
},
{
"key": "10.1016/j.jas.2019.105013_bib12",
"series-title": "Settlement Pattern Studies in the Americas: Fifty Years since Virú",
"first-page": "203",
"article-title": "Conclusions: the settlement pattern concept from an Americanist perspective",
"author": "Fish",
"year": "1999"
},
{
"key": "10.1016/j.jas.2019.105013_bib13",
"doi-asserted-by": "crossref",
"first-page": "21",
"DOI": "10.3390/geosciences9010021",
"article-title": "Remote sensing and historical morphodynamics of alluvial plains. The 1909 indus flood and the city of Dera Gazhi Khan (province of Punjab, Pakistan)",
"volume": "9",
"author": "Garcia",
"year": "2019",
"journal-title": "Geosciences"
},
{
"key": "10.1016/j.jas.2019.105013_bib14",
"unstructured": "Georgiadis, M.; Garcia-Molsosa, A.; Orengo, H.A.; Kefalidou, E. and Kallintzi, K. In Preparation. APAX Project 2015-2018: A Preliminary Report. (Hesperia)."
},
{
"key": "10.1016/j.jas.2019.105013_bib15",
"series-title": "Geographical Information Systems and Landscape Archaeology",
"first-page": "35",
"article-title": "Regional survey and GIS: the boeotia project",
"author": "Gillings",
"year": "1999"
},
{
"key": "10.1016/j.jas.2019.105013_bib16",
"doi-asserted-by": "crossref",
"first-page": "18",
"DOI": "10.1016/j.rse.2017.06.031",
"article-title": "Google Earth engine: planetary-scale geospatial analysis for everyone",
"volume": "202",
"author": "Gorelick",
"year": "2017",
"journal-title": "Remote Sens. Environ."
},
{
"issue": "107",
"key": "10.1016/j.jas.2019.105013_bib17",
"doi-asserted-by": "crossref",
"first-page": "177",
"DOI": "10.1111/j.0031-868X.2004.00278.x",
"article-title": "Photogrammetric reconstruction of the great buddha of Bamiyan, Afghanistan",
"volume": "19",
"author": "Grün",
"year": "2004",
"journal-title": "Photogramm. Rec."
},
{
"issue": "6",
"key": "10.1016/j.jas.2019.105013_bib18",
"doi-asserted-by": "crossref",
"first-page": "610",
"DOI": "10.1109/TSMC.1973.4309314",
"article-title": "Textural features for image classification",
"author": "Haralick",
"year": "1973",
"journal-title": "IEEE Trans. Syst., Man, Cybernet., SMC-3"
},
{
"key": "10.1016/j.jas.2019.105013_bib19",
"doi-asserted-by": "crossref",
"first-page": "76",
"DOI": "10.1558/jmea.v14i1.76",
"article-title": "Excavating to excess? Implications of the last decade of archaeology in Israel",
"volume": "14",
"author": "Kletter",
"year": "2001",
"journal-title": "J. Mediterr. Archaeol."
},
{
"key": "10.1016/j.jas.2019.105013_bib20",
"first-page": "299",
"article-title": "Testing Google Earth Engine for the automatic identification and vectorization of archaeological features: a case study from Faynan, Jordan",
"volume": "15",
"author": "Liss",
"year": "2017",
"journal-title": "J. Archaeol. Sci.: Report"
},
{
"key": "10.1016/j.jas.2019.105013_bib21",
"series-title": "Geographical Information Systems and Landscape Archaeology",
"first-page": "55",
"article-title": "Towards a methodology for modelling surface survey data: the sangro valley project",
"author": "Lock",
"year": "1999"
},
{
"key": "10.1016/j.jas.2019.105013_bib22",
"series-title": "Extracting Meaning from Ploughsoil Assemblages",
"first-page": "5",
"article-title": "Methods of collection recording and quantification",
"author": "Mattingly",
"year": "2000"
},
{
"issue": "14",
"key": "10.1016/j.jas.2019.105013_bib23",
"doi-asserted-by": "crossref",
"first-page": "E778",
"DOI": "10.1073/pnas.1115472109",
"article-title": "Mapping patterns of long-term settlement in Northern Mesopotamia at a large scale",
"volume": "109",
"author": "Menze",
"year": "2012",
"journal-title": "Proc. Natl. Acad. Sci."
},
{
"key": "10.1016/j.jas.2019.105013_bib24",
"doi-asserted-by": "crossref",
"first-page": "80",
"DOI": "10.1016/j.jas.2015.04.002",
"article-title": "A supervised machine-learning approach towards geochemical predictive modelling in archaeology",
"volume": "59",
"author": "Oonk",
"year": "2015",
"journal-title": "J. Archaeol. Sci."
},
{
"key": "10.1016/j.jas.2019.105013_bib25",
"doi-asserted-by": "crossref",
"first-page": "49",
"DOI": "10.1016/j.isprsjprs.2012.07.005",
"article-title": "Combining terrestrial stereophotogrammetry, DGPS and GIS-based 3D voxel modelling in the volumetric recording of archaeological features",
"volume": "76",
"author": "Orengo",
"year": "2013",
"journal-title": "ISPRS J. Photogrammetry Remote Sens."
},
{
"key": "10.1016/j.jas.2019.105013_bib26",
"doi-asserted-by": "crossref",
"first-page": "100",
"DOI": "10.1016/j.jas.2015.10.008",
"article-title": "Photogrammetric re-discovery of the Eastern Thessalian hidden long-term landscapes",
"volume": "64",
"author": "Orengo",
"year": "2015",
"journal-title": "J. Archaeol. Sci."
},
{
"issue": "3",
"key": "10.1016/j.jas.2019.105013_bib27",
"doi-asserted-by": "crossref",
"first-page": "479",
"DOI": "10.3764/aja.122.3.0479",
"article-title": "Towards a definition of Minoan agro-pastoral landscapes: results of the survey at Palaikastro (Crete)",
"volume": "122",
"author": "Orengo",
"year": "2018",
"journal-title": "Am. J. Archaeol."
},
{
"issue": "7",
"key": "10.1016/j.jas.2019.105013_bib28",
"doi-asserted-by": "crossref",
"first-page": "735",
"DOI": "10.3390/rs9070735",
"article-title": "Large-scale, multi-temporal remote sensing of palaeo-river networks: a case study from Northwest India and its implications for the Indus civilisation",
"volume": "9",
"author": "Orengo",
"year": "2017",
"journal-title": "Remote Sens."
},
{
"key": "10.1016/j.jas.2019.105013_bib29",
"doi-asserted-by": "crossref",
"first-page": "1361",
"DOI": "10.1002/esp.4317",
"article-title": "Multi-scale relief model (MSRM): a new algorithm for the visualization of subtle topographic change of variable size in digital elevation models",
"volume": "43",
"author": "Orengo",
"year": "2018",
"journal-title": "Earth Surf. Process. Landforms"
},
{
"key": "10.1016/j.jas.2019.105013_bib30",
"series-title": "Submitted to Proceedings of the National Academy of Sciences",
"article-title": "Living on the edge of the desert: automated detection of archaeological mounds in Cholistan (Pakistan) using machine learning classification of multi-sensor multi-temporal satellite data",
"author": "Orengo",
"year": "2019"
},
{
"key": "10.1016/j.jas.2019.105013_bib31",
"first-page": "154",
"article-title": "How many trees in a random forest?",
"volume": "vol. 7376",
"author": "Oshiro",
"year": "2012"
},
{
"key": "10.1016/j.jas.2019.105013_bib32",
"article-title": "Decision-making in modern surveys",
"volume": "ume 1",
"author": "Plog",
"year": "1978"
},
{
"issue": "4",
"key": "10.1016/j.jas.2019.105013_bib33",
"doi-asserted-by": "crossref",
"first-page": "100",
"DOI": "10.3390/geosciences7040100",
"article-title": "From above and on the ground: geospatial methods for recording endangered archaeology in the Middle East and north africa",
"volume": "7",
"author": "Rayne",
"year": "2017",
"journal-title": "Geosciences"
},
{
"issue": "1",
"key": "10.1016/j.jas.2019.105013_bib34",
"doi-asserted-by": "crossref",
"first-page": "1",
"DOI": "10.1080/00438243.1978.9979712",
"article-title": "The design of archaeological surveys",
"volume": "10",
"author": "Schiffer",
"year": "1978",
"journal-title": "World Archaeol."
},
{
"key": "10.1016/j.jas.2019.105013_bib35",
"series-title": "Experiments in the Collection and Analysis of Archaeological Survey Data: the East Hampshire Survey",
"author": "Shennan",
"year": "1985"
},
{
"key": "10.1016/j.jas.2019.105013_bib36",
"doi-asserted-by": "crossref",
"first-page": "1066",
"DOI": "10.1016/j.culher.2016.06.006",
"article-title": "Drones over Mediterranean landscapes. The potential of small UAV's (drones) for site detection and heritage management in archaeological survey projects: a case study from Le Pianelle in the Tappino Valley, Molise (Italy)",
"volume": "22",
"author": "Stek",
"year": "2016",
"journal-title": "J. Cult. Herit."
},
{
"key": "10.1016/j.jas.2019.105013_bib37",
"series-title": "Side-by-Side Survey. Comparative Regional Studies in the Mediterranean World",
"first-page": "65",
"article-title": "Side-by-side and back to front: exploring intra-regional latitudinal and longitudinal comparability in survey data. Three case studies from Metaponto, southern Italy",
"author": "Thomson",
"year": "2004"
},
{
"key": "10.1016/j.jas.2019.105013_bib38",
"series-title": "Digital Discovery. Exploring New Frontiers in Human Heritage. Computer Applications and Quantitative Methods in Archaeology",
"article-title": "Computer vision and machine learning for archaeology",
"author": "van der Maaten",
"year": "2007"
},
{
"key": "10.1016/j.jas.2019.105013_bib39",
"doi-asserted-by": "crossref",
"first-page": "1114",
"DOI": "10.1111/j.1475-4754.2012.00667.x",
"article-title": "Computer vision-based orthophoto mapping of complex archaeological sites: the ancient quarry of Pitaranha (Portugal-Spain)",
"volume": "54",
"author": "Verhoeven",
"year": "2012",
"journal-title": "Archaeometry"
},
{
"key": "10.1016/j.jas.2019.105013_bib40",
"series-title": "A Guide for Salvage Archeology",
"author": "Wendorf",
"year": "1962"
}
],
"container-title": [
"Journal of Archaeological Science"
],
"original-title": [
],
"language": "en",
"link": [
{
"URL": "https://api.elsevier.com/content/article/PII:S0305440319301001?httpAccept=text/xml",
"content-type": "text/xml",
"content-version": "vor",
"intended-application": "text-mining"
},
{
"URL": "https://api.elsevier.com/content/article/PII:S0305440319301001?httpAccept=text/plain",
"content-type": "text/plain",
"content-version": "vor",
"intended-application": "text-mining"
}
],
"deposited": {
"date-parts": [
[
2019,
11,
25
]
],
"date-time": "2019-11-25T06:46:34Z",
"timestamp": 1574664394000
},
"score": 1,
"subtitle": [
],
"short-title": [
],
"issued": {
"date-parts": [
[
2019,
12
]
]
},
"references-count": 39,
"alternative-id": [
"S0305440319301001"
],
"URL": "http://dx.doi.org/10.1016/j.jas.2019.105013",
"relation": {
},
"ISSN": [
"0305-4403"
],
"issn-type": [
{
"value": "0305-4403",
"type": "print"
}
],
"subject": [
"Archaeology",
"Archaeology"
],
"published": {
"date-parts": [
[
2019,
12
]
]
},
"assertion": [
{
"value": "Elsevier",
"name": "publisher",
"label": "This article is maintained by"
},
{
"value": "A brave new world for archaeological survey: Automated machine learning-based potsherd detection using high-resolution drone imagery",
"name": "articletitle",
"label": "Article Title"
},
{
"value": "Journal of Archaeological Science",
"name": "journaltitle",
"label": "Journal Title"
},
{
"value": "https://doi.org/10.1016/j.jas.2019.105013",
"name": "articlelink",
"label": "CrossRef DOI link to publisher maintained version"
},
{
"value": "article",
"name": "content_type",
"label": "Content Type"
},
{
"value": "© 2019 The Authors. Published by Elsevier Ltd.",
"name": "copyright",
"label": "Copyright"
}
],
"article-number": "105013"
}

View File

@ -0,0 +1,107 @@
package eu.dnetlib.dhp.oa.graph.resolution
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.HdfsSupport
import eu.dnetlib.dhp.schema.common.EntityType
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.slf4j.{Logger, LoggerFactory}
/** Spark job that applies the updates carried by "unresolved" results to the result entities of the raw graph.
  *
  * The job runs in two steps: `resolveEntities` re-keys the unresolved records with the identifier found in
  * the `relationResolvedPid` table, and `generateResolvedEntities` merges the re-keyed records into each of
  * the entity dumps listed in `entities`.
  */
object SparkResolveEntities {

  /** JSON (de)serializer shared by all the steps of the job. */
  val mapper = new ObjectMapper()

  /** Entity types whose dumps are read from `graphBasePath/<type>` and rewritten by this job. */
  val entities = List(EntityType.dataset, EntityType.publication, EntityType.software, EntityType.otherresearchproduct)

  /** Job entry point.
    *
    * Reads `master`, `graphBasePath`, `workingPath` and `unresolvedPath` from the command line, runs the two
    * resolution steps and finally swaps the resolved dumps into `graphBasePath`.
    *
    * @param args command line arguments, described by `resolve_params.json`
    */
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master"))
        .getOrCreate()

    val graphBasePath = parser.get("graphBasePath")
    log.info(s"graphBasePath -> $graphBasePath")
    val workingPath = parser.get("workingPath")
    log.info(s"workingPath -> $workingPath")
    val unresolvedPath = parser.get("unresolvedPath")
    log.info(s"unresolvedPath -> $unresolvedPath")

    val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
    fs.mkdirs(new Path(workingPath))

    resolveEntities(spark, workingPath, unresolvedPath)
    generateResolvedEntities(spark, workingPath, graphBasePath)

    // TO BE conservative we keep the original entities in the working dir
    // and save the resolved entities on the graphBasePath
    // In future these lines of code should be removed
    entities.foreach { e =>
      moveOrFail(fs, new Path(s"$graphBasePath/$e"), new Path(s"$workingPath/${e}_old"))
      moveOrFail(fs, new Path(s"$workingPath/resolvedGraph/$e"), new Path(s"$graphBasePath/$e"))
    }
  }

  /** Renames `source` to `target` and fails loudly when the file system reports that nothing was moved.
    *
    * `FileSystem.rename` signals some failures only through its boolean result; ignoring it would let the
    * job end successfully while the graph directory is missing or still holds the old data.
    */
  private def moveOrFail(fs: FileSystem, source: Path, target: Path): Unit = {
    if (!fs.rename(source, target))
      throw new java.io.IOException(s"Unable to move $source to $target")
  }

  /** Tells whether `r` is an instance of the base `Result` class rather than of one of its typed subclasses. */
  private def isPlainResult(r: Result): Boolean =
    r.getClass.getSimpleName.toLowerCase == "result"

  /** Re-keys the unresolved records with the identifiers stored in the resolved-pid table.
    *
    * The `(String, String)` table saved under `workingPath/relationResolvedPid` is inner-joined with the
    * records read (one JSON document per line) from `unresolvedPath`, matching the second column of the table
    * against the record id. Every matched record gets the first column of the table as its new id and is
    * saved under `workingPath/resolvedEntities`.
    *
    * @param spark          the active Spark session
    * @param workingPath    directory holding `relationResolvedPid` and receiving `resolvedEntities`
    * @param unresolvedPath text dump of the unresolved records
    */
  def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String): Unit = {
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    import spark.implicits._

    val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)]
    val up: Dataset[(String, Result)] = spark.read
      .text(unresolvedPath)
      .as[String]
      .map(s => mapper.readValue(s, classOf[Result]))
      .map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder))

    rPid
      .joinWith(up, rPid("_2").equalTo(up("_1")), "inner")
      .map { r =>
        val result = r._2._2
        val dnetId = r._1._1
        result.setId(dnetId)
        result
      }
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$workingPath/resolvedEntities")
  }

  /** Parses a JSON record into the concrete `Result` subclass associated with `entity`.
    *
    * @param input  the JSON document
    * @param entity the entity type the document belongs to; must be one of `entities`
    * @throws IllegalArgumentException when `entity` is not a result type handled by this job
    */
  def deserializeObject(input: String, entity: EntityType): Result = {
    entity match {
      case EntityType.publication          => mapper.readValue(input, classOf[Publication])
      case EntityType.dataset              => mapper.readValue(input, classOf[OafDataset])
      case EntityType.software             => mapper.readValue(input, classOf[Software])
      case EntityType.otherresearchproduct => mapper.readValue(input, classOf[OtherResearchProduct])
      case other                           => throw new IllegalArgumentException(s"Unsupported entity type: $other")
    }
  }

  /** Merges the resolved records into every entity dump of the graph.
    *
    * For each entity type the dump under `graphBasePath/<type>` is united with the records of
    * `workingPath/resolvedEntities`, grouped by id and merged; the outcome is written as gzip-compressed JSON
    * text under `workingPath/resolvedGraph/<type>`. Groups that end up as a plain `Result` (i.e. updates
    * that matched no record of the current type) are discarded.
    *
    * @param spark         the active Spark session
    * @param workingPath   directory holding `resolvedEntities` and receiving `resolvedGraph`
    * @param graphBasePath base directory of the entity dumps
    */
  def generateResolvedEntities(spark: SparkSession, workingPath: String, graphBasePath: String): Unit = {
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    import spark.implicits._

    val re: Dataset[Result] = spark.read.load(s"$workingPath/resolvedEntities").as[Result]
    entities.foreach { e =>
      spark.read
        .text(s"$graphBasePath/$e")
        .as[String]
        .map(s => deserializeObject(s, e))
        .union(re)
        .groupByKey(_.getId)
        .reduceGroups { (x, y) =>
          // Which operand arrives as `x` is not under our control. The typed graph record must be the
          // merge target: merging it INTO the plain update would yield a plain Result, which the filter
          // below throws away together with the graph record it absorbed.
          if (isPlainResult(x) && !isPlainResult(y)) {
            y.mergeFrom(x)
            y
          } else {
            x.mergeFrom(y)
            x
          }
        }
        .map(_._2)
        .filter(r => !isPlainResult(r))
        .map(r => mapper.writeValueAsString(r))(Encoders.STRING)
        .write
        .mode(SaveMode.Overwrite)
        .option("compression", "gzip")
        .text(s"$workingPath/resolvedGraph/$e")
    }
  }
}

View File

@ -96,6 +96,21 @@ object SparkResolveRelation {
.text(s"$graphBasePath/relation") .text(s"$graphBasePath/relation")
} }
/** Extracts, from the JSON serialization of a record, the pairs made of the instance type and of the
  * provenance of its instances.
  *
  * A pair is produced for every object found under `instance` that carries both a `collectedfrom` object
  * with a string `value` and an `instancetype` object with a string `classname`.
  *
  * @param input the JSON document of the record
  * @return the list of `(instancetype.classname, collectedfrom.value)` pairs; empty when nothing matches
  */
def extractInstanceCF(input: String): List[(String, String)] = {
  // No json4s `Formats` is needed here: values are read by pattern matching, never through `extract`.
  val json: json4s.JValue = parse(input)
  for {
    JObject(iObj) <- json \ "instance"
    JField("collectedfrom", JObject(cf)) <- iObj
    JField("instancetype", JObject(instancetype)) <- iObj
    JField("value", JString(collectedFrom)) <- cf
    JField("classname", JString(classname)) <- instancetype
  } yield (classname, collectedFrom)
}
def extractPidsFromRecord(input: String): (String, List[(String, String)]) = { def extractPidsFromRecord(input: String): (String, List[(String, String)]) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -108,14 +123,7 @@ object SparkResolveRelation {
JField("classid", JString(pidType)) <- qualifier JField("classid", JString(pidType)) <- qualifier
} yield (pidValue, pidType) } yield (pidValue, pidType)
val alternateIds: List[(String, String)] = for { (id, result)
JObject(pids) <- json \\ "alternateIdentifier"
JField("value", JString(pidValue)) <- pids
JField("qualifier", JObject(qualifier)) <- pids
JField("classid", JString(pidType)) <- qualifier
} yield (pidValue, pidType)
(id, result ::: alternateIds)
} }
@ -128,7 +136,7 @@ object SparkResolveRelation {
source != null source != null
} }
private def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = { def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = {
import spark.implicits._ import spark.implicits._
val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*") val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*")

View File

@ -59,7 +59,12 @@ object SparkConvertRDDtoDataset {
log.info("Converting Relation") log.info("Converting Relation")
val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation").map(s => mapper.readValue(s, classOf[Relation])).filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50")) val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin")
val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation")
.map(s => mapper.readValue(s, classOf[Relation]))
.filter(r=> r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
.filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")

View File

@ -4,6 +4,10 @@
<name>graphBasePath</name> <name>graphBasePath</name>
<description>the path of the graph</description> <description>the path of the graph</description>
</property> </property>
<property>
<name>unresolvedPath</name>
<description>the path of the unresolved Entities</description>
</property>
</parameters> </parameters>
<start to="ResolveRelations"/> <start to="ResolveRelations"/>
@ -36,5 +40,33 @@
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!--
    Runs SparkResolveEntities on the raw graph, passing it the graph path, the path of the
    unresolved entities and the workflow working directory.
    NOTE(review): confirm that the <ok to="..."/> of a preceding action targets "ResolveEntities",
    otherwise this action is never scheduled.
-->
<action name="ResolveEntities">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn</master>
        <mode>cluster</mode>
        <name>Resolve Entities in raw graph</name>
        <class>eu.dnetlib.dhp.oa.graph.resolution.SparkResolveEntities</class>
        <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.shuffle.partitions=10000
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
        </spark-opts>
        <arg>--master</arg><arg>yarn</arg>
        <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
        <arg>--unresolvedPath</arg><arg>${unresolvedPath}</arg>
        <arg>--workingPath</arg><arg>${workingDir}</arg>
    </spark>
    <ok to="End"/>
    <error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -0,0 +1,6 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the working directory", "paramRequired": true},
{"paramName":"u", "paramLongName":"unresolvedPath", "paramDescription": "the path of the unresolved entities", "paramRequired": true},
{"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true}
]

View File

@ -0,0 +1,190 @@
package eu.dnetlib.dhp.oa.graph.resolution
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.common.EntityType
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.schema.oaf.{Result, StructuredProperty}
import org.apache.commons.io.FileUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.TestInstance.Lifecycle
import org.junit.jupiter.api.{AfterAll, BeforeAll, Test, TestInstance}
import java.nio.file.{Files, Path}
import scala.collection.JavaConverters._
import scala.io.Source
/** Integration test of the entity resolution job, run against a local Spark session.
  *
  * `setUp` copies the entity fixtures into a temporary graph directory, builds the resolved-pid table and
  * generates a set of fake updates; the tests then run the two steps of `SparkResolveEntities` on them.
  */
@TestInstance(Lifecycle.PER_CLASS)
class ResolveEntitiesTest extends Serializable {

  var workingDir: Path = null

  val FAKE_TITLE = "FAKETITLE"
  val FAKE_SUBJECT = "FAKESUBJECT"

  var sparkSession: Option[SparkSession] = None

  /** Creates the temporary working directory and the Spark session, then prepares the input data. */
  @BeforeAll
  def setUp(): Unit = {
    workingDir = Files.createTempDirectory(getClass.getSimpleName)

    val spark = SparkSession
      .builder()
      .config(new SparkConf())
      .appName(getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    sparkSession = Some(spark)

    populateDatasets(spark)
    generateUpdates(spark)
  }

  /** Removes the temporary directory and stops Spark; both steps tolerate a `setUp` that failed half-way. */
  @AfterAll
  def tearDown(): Unit = {
    try Option(workingDir).foreach(dir => FileUtils.deleteDirectory(dir.toFile))
    finally sparkSession.foreach(_.stop())
  }

  /** Returns the session created by `setUp`, failing with a clear message when it is missing. */
  private def activeSession: SparkSession =
    sparkSession.getOrElse(throw new IllegalStateException("Spark session not initialised: setUp() did not run"))

  /** Reads a classpath resource, resolved against this class, as a list of lines and closes the stream. */
  private def readResourceLines(name: String): List[String] = {
    val stream = Option(getClass.getResourceAsStream(name))
      .getOrElse(throw new IllegalStateException(s"Missing test resource: $name"))
    val source = Source.fromInputStream(stream)
    try source.getLines().toList
    finally source.close()
  }

  /** Builds a structured property holding `value` and a fixed fake qualifier. */
  private def fakeProperty(value: String): StructuredProperty =
    OafMapperUtils.structuredProperty(value, OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"), null)

  /** Generates one fake update per identifier listed in the `updates` resource, stores them as gzip JSON
    * text under `workingDir/updates` and verifies that they can be read back unchanged.
    */
  def generateUpdates(spark: SparkSession): Unit = {
    val jsonMapper = new ObjectMapper()
    val pids: List[String] = readResourceLines("updates").map { id =>
      val r = new Result
      r.setId(id.toLowerCase.trim)
      r.setSubject(List(fakeProperty(FAKE_SUBJECT)).asJava)
      r.setTitle(List(fakeProperty(FAKE_TITLE)).asJava)
      jsonMapper.writeValueAsString(r)
    }

    val sc = spark.sparkContext
    println(sc.parallelize(pids).count())
    spark
      .createDataset(sc.parallelize(pids))(Encoders.STRING)
      .write
      .mode(SaveMode.Overwrite)
      .option("compression", "gzip")
      .text(s"$workingDir/updates")

    import spark.implicits._
    implicit val resEncoder: Encoder[Result] = Encoders.bean(classOf[Result])
    val ds = spark.read
      .text(s"$workingDir/updates")
      .as[String]
      .map { s =>
        // The mapper is created inside the closure so that nothing from the test instance is captured.
        val mapper = new ObjectMapper()
        mapper.readValue(s, classOf[Result])
      }
      .collect()

    assertEquals(4, ds.length)
    ds.foreach { r => assertNotNull(r.getSubject) }
    ds.foreach { r => assertEquals(1, r.getSubject.size()) }
    ds.foreach { r => assertNotNull(r.getTitle) }
    ds.foreach { r => assertEquals(1, r.getTitle.size()) }

    ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_TITLE, t))
    ds.flatMap(r => r.getSubject.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_SUBJECT, t))

    println("generated Updates")
  }

  /** Copies the fixture of every entity type into `workingDir/graph/<type>` and builds the resolved-pid
    * table under `workingDir/work`.
    */
  def populateDatasets(spark: SparkSession): Unit = {
    import spark.implicits._
    val entities = SparkResolveEntities.entities
    entities.foreach { e =>
      val records = readResourceLines(s"$e")
      spark
        .createDataset(spark.sparkContext.parallelize(records))
        .as[String]
        .write
        .option("compression", "gzip")
        .text(s"$workingDir/graph/$e")
      println(s"Created Dataset $e")
    }
    SparkResolveRelation.extractPidResolvedTableFromJsonRDD(spark, s"$workingDir/graph", s"$workingDir/work")
  }

  /** The updates that can be resolved must be re-keyed with a graph identifier (prefix `50`). */
  @Test
  def testResolution(): Unit = {
    val spark: SparkSession = activeSession
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])

    SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", s"$workingDir/updates")

    val ds = spark.read.load(s"$workingDir/work/resolvedEntities").as[Result]
    assertEquals(3, ds.count())
    ds.collect().foreach { r =>
      assertTrue(r.getId.startsWith("50"))
    }
  }

  /** Counts the records of the resolved dump of `entity` having both title and subject set and at least
    * one title equal (ignoring case) to the fake one.
    */
  private def countWithFakeTitle(spark: SparkSession, entity: EntityType): Long = {
    import spark.implicits._
    implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
    // Local copy: the Spark closures below must not reference members of the test instance
    // (presumably it cannot be shipped to the executors, e.g. because of the `workingDir` field).
    val expectedTitle = FAKE_TITLE

    spark.read
      .text(s"$workingDir/work/resolvedGraph/$entity")
      .as[String]
      .map(s => SparkResolveEntities.deserializeObject(s, entity))
      .filter(p => p.getTitle != null && p.getSubject != null)
      .filter(p => p.getTitle.asScala.exists(t => t.getValue != null && t.getValue.equalsIgnoreCase(expectedTitle)))
      .count()
  }

  /** After the merge, the fake title must show up only in the entity dumps the updates resolve to. */
  @Test
  def testUpdate(): Unit = {
    val spark: SparkSession = activeSession

    SparkResolveEntities.resolveEntities(spark, s"$workingDir/work", s"$workingDir/updates")
    SparkResolveEntities.generateResolvedEntities(spark, s"$workingDir/work", s"$workingDir/graph")

    val t = countWithFakeTitle(spark, EntityType.publication)
    val td = countWithFakeTitle(spark, EntityType.dataset)
    val ts = countWithFakeTitle(spark, EntityType.software)
    val to = countWithFakeTitle(spark, EntityType.otherresearchproduct)

    assertEquals(0, t)
    assertEquals(2, td)
    assertEquals(1, ts)
    assertEquals(0, to)
  }
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,4 @@
unresolved::10.17026/dans-x3z-fsq5::doi
unresolved::10.17026/dans-xsw-qtnx::doi
unresolved::10.5281/zenodo.1473694::doi
unresolved::10.17632/fake::doi

View File

@ -753,7 +753,7 @@
<mockito-core.version>3.3.3</mockito-core.version> <mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version> <mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version> <vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.8.22-SNAPSHOT]</dhp-schemas.version> <dhp-schemas.version>[2.8.22]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version> <dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version> <dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version> <dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>