Compare commits

...

17 Commits

Author SHA1 Message Date
Miriam Baglioni 481c4e28d1 [Enrichment single step] changed test for country propagation and fixed inconsistencies 2022-11-24 09:36:10 +01:00
Miriam Baglioni 2e6af7c655 mergin with branch beta 2022-11-23 11:31:35 +01:00
Miriam Baglioni de9d0ace38 [Enrichment single step] modification of workflow ans some change in the classes 2022-11-23 09:54:50 +01:00
Miriam Baglioni b0969461f8 mergin with branch beta 2022-11-22 16:54:23 +01:00
Miriam Baglioni 1e233bedf6 [Enrichment] - 2022-04-19 12:32:22 +02:00
Miriam Baglioni 30e0f60ac8 [Enrichment Step] get rid of hive 2022-04-14 08:50:37 +02:00
Miriam Baglioni 7501e823ed [Enrichment Step] get rid of hive 2022-04-13 17:46:22 +02:00
Miriam Baglioni d205bf78d8 [Enrichment Step] issue of NPE on author should be fixed 2022-04-13 14:39:13 +02:00
Miriam Baglioni 550e1a4e33 [Enrichment Step] issue of NPE on author should be fixed 2022-04-13 14:34:48 +02:00
Miriam Baglioni 8a39a85a5f [Enrichment WF] fixed issue in wf 2022-04-13 12:30:03 +02:00
Miriam Baglioni d1519fa28f [Enrichment Step] get rid of hive 2022-04-13 11:48:03 +02:00
Miriam Baglioni aecea5a095 mergin with branch beta 2022-04-12 12:57:31 +02:00
Miriam Baglioni 1a8641227d [Enrichment Step] get rid of hive 2022-04-12 11:26:48 +02:00
Miriam Baglioni 157e6bf5e1 [Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00
Miriam Baglioni a6c26a9e0e [Enrichment WF] third attempt to make it run in a single step 2022-04-11 09:58:39 +02:00
Miriam Baglioni 9bd5310112 [Enrichment WF] second attempt to make it run in a single step 2022-04-07 18:58:55 +02:00
Miriam Baglioni 7406c88276 [Enrichment WF] first attempt to make it run in a single step 2022-04-06 18:14:05 +02:00
74 changed files with 3645 additions and 1266 deletions

View File

@ -0,0 +1,982 @@
{
"indexed": {
"date-parts": [
[
2022,
4,
5
]
],
"date-time": "2022-04-05T11:07:00Z",
"timestamp": 1649156820730
},
"reference-count": 63,
"publisher": "Public Library of Science (PLoS)",
"issue": "5",
"license": [
{
"start": {
"date-parts": [
[
2020,
5,
29
]
],
"date-time": "2020-05-29T00:00:00Z",
"timestamp": 1590710400000
},
"content-version": "vor",
"delay-in-days": 0,
"URL": "http://creativecommons.org/licenses/by/4.0/"
}
],
"funder": [
{
"DOI": "10.13039/501100001602",
"name": "Science Foundation Ireland",
"doi-asserted-by": "crossref",
"award": [
"SFI/12/RC/2273"
]
},
{
"DOI": "10.13039/501100001602",
"name": "Science Foundation Ireland",
"doi-asserted-by": "crossref",
"award": [
"SFI/12/RC/2273"
]
},
{
"DOI": "10.13039/501100001602",
"name": "Science Foundation Ireland",
"doi-asserted-by": "crossref",
"award": [
"SFI/12/RC/2273"
]
},
{
"DOI": "10.13039/501100001602",
"name": "Science Foundation Ireland",
"doi-asserted-by": "crossref",
"award": [
"SFI/12/RC/2273"
]
}
],
"content-domain": {
"domain": [
"www.plosone.org"
],
"crossmark-restriction": false
},
"short-container-title": [
"PLoS ONE"
],
"DOI": "10.1371/journal.pone.0233284",
"type": "journal-article",
"created": {
"date-parts": [
[
2020,
5,
29
]
],
"date-time": "2020-05-29T17:53:47Z",
"timestamp": 1590774827000
},
"page": "e0233284",
"update-policy": "http://dx.doi.org/10.1371/journal.pone.corrections_policy",
"source": "Crossref",
"is-referenced-by-count": 13,
"title": [
"Vancomycin and nisin A are effective against biofilms of multi-drug resistant Staphylococcus aureus isolates from human milk"
],
"prefix": "10.1371",
"volume": "15",
"author": [
{
"ORCID": "http://orcid.org/0000-0003-4107-0278",
"authenticated-orcid": true,
"given": "Angeliki",
"family": "Angelopoulou",
"sequence": "first",
"affiliation": [
]
},
{
"given": "Des",
"family": "Field",
"sequence": "additional",
"affiliation": [
]
},
{
"given": "Mariana",
"family": "Pérez-Ibarreche",
"sequence": "additional",
"affiliation": [
]
},
{
"ORCID": "http://orcid.org/0000-0001-8317-6455",
"authenticated-orcid": true,
"given": "Alicja K.",
"family": "Warda",
"sequence": "additional",
"affiliation": [
]
},
{
"given": "Colin",
"family": "Hill",
"sequence": "additional",
"affiliation": [
]
},
{
"given": "R. Paul",
"family": "Ross",
"sequence": "additional",
"affiliation": [
]
}
],
"member": "340",
"published-online": {
"date-parts": [
[
2020,
5,
29
]
]
},
"reference": [
{
"issue": "78",
"key": "pone.0233284.ref001",
"doi-asserted-by": "crossref",
"first-page": "509",
"DOI": "10.1016/S0899-9007(00)00363-4",
"article-title": "Breast milk: a truly functional food",
"volume": "16",
"author": "B. Lönnerdal",
"year": "2000",
"journal-title": "Nutrition"
},
{
"key": "pone.0233284.ref002",
"first-page": "1",
"volume-title": "Mastitis: causes and management",
"author": "WHO",
"year": "2000"
},
{
"issue": "12",
"key": "pone.0233284.ref003",
"doi-asserted-by": "crossref",
"first-page": "115",
"DOI": "10.1038/pr.2014.178",
"article-title": "Human milk and infant intestinal mucosal glycans guide succession of the neonatal intestinal microbiota",
"volume": "77",
"author": "DS Newburg",
"year": "2015",
"journal-title": "Pediatr Res"
},
{
"issue": "7",
"key": "pone.0233284.ref004",
"doi-asserted-by": "crossref",
"first-page": "647",
"DOI": "10.1001/jamapediatrics.2017.0378",
"article-title": "Association between breast milk bacterial communities and establishment and development of the infant gut microbiome",
"volume": "171",
"author": "PS Pannaraj",
"year": "2017",
"journal-title": "JAMA Pediatr"
},
{
"issue": "Suppl 2:",
"key": "pone.0233284.ref005",
"doi-asserted-by": "crossref",
"first-page": "S69",
"DOI": "10.1542/peds.2008-1315i",
"article-title": "Why mothers stop breastfeeding: mothers' self-reported reasons for stopping during the first year",
"volume": "122",
"author": "R Li",
"year": "2008",
"journal-title": "Pediatrics"
},
{
"issue": "2",
"key": "pone.0233284.ref006",
"doi-asserted-by": "crossref",
"first-page": "83",
"DOI": "10.1007/s00430-017-0532-z",
"article-title": "The microbiology and treatment of human mastitis",
"volume": "207",
"author": "A Angelopoulou",
"year": "2018",
"journal-title": "Med Microbiol Immunol"
},
{
"issue": "2",
"key": "pone.0233284.ref007",
"doi-asserted-by": "crossref",
"first-page": "169",
"DOI": "10.3920/BM2013.0036",
"article-title": "Probiotics for human lactational mastitis",
"volume": "5",
"author": "L Fernández",
"year": "2014",
"journal-title": "Benef Microbes"
},
{
"issue": "15",
"key": "pone.0233284.ref008",
"doi-asserted-by": "crossref",
"first-page": "4650",
"DOI": "10.1128/AEM.02599-07",
"article-title": "Oral administration of Lactobacillus strains isolated from breast milk as an alternative for the treatment of infectious mastitis during lactation",
"volume": "74",
"author": "E Jiménez",
"year": "2008",
"journal-title": "Appl Environ Microbiol"
},
{
"issue": "3",
"key": "pone.0233284.ref009",
"doi-asserted-by": "crossref",
"first-page": "406",
"DOI": "10.1177/0890334415585078",
"article-title": "Metagenomic analysis of milk of healthy and mastitis-suffering women",
"volume": "31",
"author": "E Jiménez",
"year": "2015",
"journal-title": "J Hum Lact"
},
{
"issue": "2",
"key": "pone.0233284.ref010",
"doi-asserted-by": "crossref",
"first-page": "176",
"DOI": "10.1086/589241",
"article-title": "Risk of infection and death due to methicillin-resistant Staphylococcus aureus in long-term carriers",
"volume": "47",
"author": "R Datta",
"year": "2008",
"journal-title": "Clin Infect Dis"
},
{
"issue": "4",
"key": "pone.0233284.ref011",
"doi-asserted-by": "crossref",
"DOI": "10.1128/microbiolspec.GPP3-0023-2018",
"article-title": "Staphylococcal biofilms",
"volume": "6",
"author": "M. Otto",
"year": "2018",
"journal-title": "Microbiol Spectr"
},
{
"issue": "1",
"key": "pone.0233284.ref012",
"doi-asserted-by": "crossref",
"first-page": "9",
"DOI": "10.1038/s41522-018-0053-6",
"article-title": "Fighting biofilms with lantibiotics and other groups of bacteriocins",
"volume": "4",
"author": "H Mathur",
"year": "2018",
"journal-title": "NPJ Biofilms Microbiomes"
},
{
"issue": "2",
"key": "pone.0233284.ref013",
"doi-asserted-by": "crossref",
"first-page": "310",
"DOI": "10.1128/MMBR.00041-08",
"article-title": "Signals, regulatory networks, and materials that build and break bacterial biofilms",
"volume": "73",
"author": "E Karatan",
"year": "2009",
"journal-title": "Microbiol Mol Biol Rev"
},
{
"issue": "3",
"key": "pone.0233284.ref014",
"doi-asserted-by": "crossref",
"first-page": "147",
"DOI": "10.1016/S1473-3099(01)00091-3",
"article-title": "Vancomycin-resistant Staphylococcus aureus: a new model of antibiotic resistance",
"volume": "1",
"author": "K Hiramatsu",
"year": "2001",
"journal-title": "Lancet Infect Dis"
},
{
"key": "pone.0233284.ref015",
"doi-asserted-by": "crossref",
"first-page": "339",
"DOI": "10.1146/annurev.mi.38.100184.002011",
"article-title": "The structure and mode of action of glycopeptide antibiotics of the vancomycin group",
"volume": "38",
"author": "JC Barna",
"year": "1984",
"journal-title": "Annu Rev Microbiol"
},
{
"key": "pone.0233284.ref016",
"unstructured": "Health Service Executive Mastitis Factsheet for Health Care Professionals. Available at: https://www.breastfeeding.ie/Uploads/Mastitis.pdf"
},
{
"issue": "3",
"key": "pone.0233284.ref017",
"first-page": "136",
"article-title": "Breast infection: a review of diagnosis and management practices",
"volume": "14",
"author": "E Boakes",
"year": "2018",
"journal-title": "Eur J Breast Health"
},
{
"key": "pone.0233284.ref018",
"doi-asserted-by": "crossref",
"first-page": "1205",
"DOI": "10.3389/fmicb.2017.01205",
"article-title": "Bacteriocin-antimicrobial synergy: A medical and food perspective",
"volume": "8",
"author": "H Mathur",
"year": "2017",
"journal-title": "Front Microbiol"
},
{
"issue": "1",
"key": "pone.0233284.ref019",
"doi-asserted-by": "crossref",
"first-page": "223",
"DOI": "10.1016/S0005-2736(99)00208-4",
"article-title": "The lantibiotic nisin, a special case or not?",
"volume": "1462",
"author": "E Breukink",
"year": "1999",
"journal-title": "Biochim Biophys Acta"
},
{
"issue": "10",
"key": "pone.0233284.ref020",
"doi-asserted-by": "crossref",
"first-page": "963",
"DOI": "10.1038/nsmb830",
"article-title": "The nisin-lipid II complex reveals a pyrophosphate cage that provides a blueprint for novel antibiotics",
"volume": "11",
"author": "ST Hsu",
"year": "2004",
"journal-title": "Nat Struct Mol Biol"
},
{
"issue": "5793",
"key": "pone.0233284.ref021",
"doi-asserted-by": "crossref",
"first-page": "1636",
"DOI": "10.1126/science.1129818",
"article-title": "An alternative bactericidal mechanism of action for lantibiotic peptides that target lipid II",
"volume": "313",
"author": "HE Hasper",
"year": "2006",
"journal-title": "Science"
},
{
"issue": "3",
"key": "pone.0233284.ref022",
"doi-asserted-by": "crossref",
"first-page": "1772",
"DOI": "10.1074/jbc.M006770200",
"article-title": "Specific binding of nisin to the peptidoglycan precursor lipid II combines pore formation and inhibition of cell wall biosynthesis for potent antibiotic activity",
"volume": "276",
"author": "I Wiedemann",
"year": "2001",
"journal-title": "J Biol Chem"
},
{
"key": "pone.0233284.ref023",
"doi-asserted-by": "crossref",
"first-page": "104539",
"DOI": "10.1016/j.idairyj.2019.104539",
"article-title": "Bovine mastitis is a polymicrobial disease requiring a polydiagnostic approach",
"volume": "99",
"author": "A Angelopoulou",
"year": "2019",
"journal-title": "Int Dairy J"
},
{
"issue": "4",
"key": "pone.0233284.ref024",
"doi-asserted-by": "crossref",
"first-page": "493",
"DOI": "10.1093/ajcp/45.4_ts.493",
"article-title": "Antibiotic susceptibility testing by a standardized single disk method",
"volume": "45",
"author": "AW Bauer",
"year": "1966",
"journal-title": "Am Journal Clin Pathol"
},
{
"key": "pone.0233284.ref025",
"unstructured": "v_9.0_Breakpoint_Tables.pdf. Available at: http://www.eucast.org/fileadmin/src/media/PDFs/EUCAST_files/Breakpoint_tables/v_9.0_Breakpoint_Tables.pdf (Accessed: 28th July 2019)."
},
{
"issue": "12",
"key": "pone.0233284.ref026",
"doi-asserted-by": "crossref",
"first-page": "67",
"DOI": "10.1016/j.vetmic.2010.05.044",
"article-title": "(GTG)5-PCR fingerprinting for the classification and identification of coagulase-negative Staphylococcus species from bovine milk and teat apices: a comparison of type strains and field isolates",
"volume": "147",
"author": "G Braem",
"year": "2011",
"journal-title": "Vet Microbiol"
},
{
"key": "pone.0233284.ref027",
"doi-asserted-by": "crossref",
"first-page": "270",
"DOI": "10.1186/s12859-015-0703-0",
"article-title": "GelJ a tool for analyzing DNA fingerprint gel images",
"volume": "16",
"author": "J Heras",
"year": "2015",
"journal-title": "BMC bioinformatics"
},
{
"issue": "4",
"key": "pone.0233284.ref028",
"doi-asserted-by": "crossref",
"first-page": "473",
"DOI": "10.1111/j.1751-7915.2010.00184.x",
"article-title": "Studies with bioengineered nisin peptides highlight the broad-spectrum potency of nisin V",
"volume": "3",
"author": "D Field",
"year": "2010",
"journal-title": "Microb Biotechnol"
},
{
"issue": "11",
"key": "pone.0233284.ref029",
"doi-asserted-by": "crossref",
"first-page": "e79563",
"DOI": "10.1371/journal.pone.0079563",
"article-title": "Intensive mutagenesis of the nisin hinge leads to the rational design of enhanced derivatives",
"volume": "8",
"author": "B Healy",
"year": "2013",
"journal-title": "PLoS One"
},
{
"issue": "10",
"key": "pone.0233284.ref030",
"doi-asserted-by": "crossref",
"first-page": "e46884",
"DOI": "10.1371/journal.pone.0046884",
"article-title": "Bioengineered nisin A derivatives with enhanced activity against both Gram positive and Gram negative pathogens",
"volume": "7",
"author": "D Field",
"year": "2012",
"journal-title": "PLoS One"
},
{
"issue": "3",
"key": "pone.0233284.ref031",
"doi-asserted-by": "crossref",
"first-page": "e0119684",
"DOI": "10.1371/journal.pone.0119684",
"article-title": "A Bioengineered nisin derivative to control biofilms of Staphylococcus pseudintermedius",
"volume": "10",
"author": "D Field",
"year": "2015a",
"journal-title": "PLoS One"
},
{
"issue": "2",
"key": "pone.0233284.ref032",
"doi-asserted-by": "crossref",
"first-page": "207",
"DOI": "10.1007/s13765-012-3253-4",
"article-title": "Biofilm formation, attachment, and cell hydrophobicity of foodborne pathogens under varied environmental conditions",
"volume": "56",
"author": "NY Choi",
"year": "2013",
"journal-title": "J Korean Soc Appl Biol Chem"
},
{
"issue": "2",
"key": "pone.0233284.ref033",
"doi-asserted-by": "crossref",
"first-page": "175",
"DOI": "10.1016/S0167-7012(00)00122-6",
"article-title": "A modified microtiter-plate test for quantification of staphylococcal biofilm formation",
"volume": "40",
"author": "S Stepanović",
"year": "2000",
"journal-title": "J Microbiol Methods"
},
{
"issue": "2",
"key": "pone.0233284.ref034",
"doi-asserted-by": "crossref",
"first-page": "225",
"DOI": "10.1111/j.1574-695X.2011.00806.x",
"article-title": "Characterization of Staphylococcus aureus strains involved in human and bovine mastitis",
"volume": "62",
"author": "S Delgado",
"year": "2011",
"journal-title": "FEMS Immunol Med Microbiol"
},
{
"key": "pone.0233284.ref035",
"doi-asserted-by": "crossref",
"first-page": "53",
"DOI": "10.2174/1874285801711010053",
"article-title": "Understanding the mechanism of bacterial biofilms resistance to antimicrobial agents",
"volume": "11",
"author": "S Singh",
"year": "2017",
"journal-title": "Open Microbiol J"
},
{
"issue": "1",
"key": "pone.0233284.ref036",
"doi-asserted-by": "crossref",
"first-page": "61",
"DOI": "10.2174/1389203053027584",
"article-title": "Bacterial lantibiotics: strategies to improve therapeutic potential",
"volume": "6",
"author": "PD Cotter",
"year": "2005",
"journal-title": "Curr Protein Pept Sci"
},
{
"issue": "5",
"key": "pone.0233284.ref037",
"doi-asserted-by": "crossref",
"first-page": "494",
"DOI": "10.1016/j.ijantimicag.2015.07.011",
"article-title": "Bacteriocins and their position in the next wave of conventional antibiotics",
"volume": "46",
"author": "VL Cavera",
"year": "2015",
"journal-title": "Int J Antimicrob Agents"
},
{
"key": "pone.0233284.ref038",
"doi-asserted-by": "crossref",
"first-page": "1363",
"DOI": "10.3389/fmicb.2015.01363",
"article-title": "Bioengineering lantibiotics for therapeutic success",
"volume": "6",
"author": "D Field",
"year": "2015b",
"journal-title": "Front Microbiol"
},
{
"issue": "11",
"key": "pone.0233284.ref039",
"doi-asserted-by": "crossref",
"first-page": "5572",
"DOI": "10.1128/AAC.00888-13",
"article-title": "Effects of bacteriocins on methicillin-resistant Staphylococcus aureus biofilm",
"volume": "57",
"author": "K Okuda",
"year": "2013",
"journal-title": "Antimicrob Agents Chemother"
},
{
"issue": "6",
"key": "pone.0233284.ref040",
"doi-asserted-by": "crossref",
"first-page": "511",
"DOI": "10.1159/000335598",
"article-title": "In vitro activities of nisin alone or in combination with vancomycin and ciprofloxacin against methicillin-resistant and methicillin-susceptible Staphylococcus aureus strains",
"volume": "57",
"author": "S Dosler",
"year": "2011",
"journal-title": "Chemotherapy"
},
{
"issue": "18",
"key": "pone.0233284.ref041",
"doi-asserted-by": "crossref",
"first-page": "5809",
"DOI": "10.1128/AEM.01104-07",
"article-title": "Dissection and modulation of the four distinct activities of nisin by mutagenesis of rings A and B and by C-terminal truncation",
"volume": "73",
"author": "R Rink",
"year": "2007",
"journal-title": "Appl Environ Microbiol"
},
{
"issue": "6",
"key": "pone.0233284.ref042",
"doi-asserted-by": "crossref",
"first-page": "806",
"DOI": "10.1007/s00253-004-1599-1",
"article-title": "Site-directed mutagenesis of the hinge region of nisinZ and properties of nisinZ mutants",
"volume": "64",
"author": "J Yuan",
"year": "2004",
"journal-title": "Appl Microbiol Biotechnol"
},
{
"key": "pone.0233284.ref043",
"doi-asserted-by": "crossref",
"first-page": "508",
"DOI": "10.3389/fmicb.2016.00508",
"article-title": "In vitro activities of nisin and nisin derivatives alone and in combination with antibiotics against Staphylococcus biofilms",
"volume": "7",
"author": "D Field",
"year": "2016",
"journal-title": "Front Microbiol"
},
{
"issue": "18",
"key": "pone.0233284.ref044",
"doi-asserted-by": "crossref",
"first-page": "1573",
"DOI": "10.2217/fmb-2019-0153",
"article-title": "Nisin Z and lacticin 3147 improve efficacy of antibiotics against clinically significant bacteria",
"volume": "14",
"author": "JC Ellis",
"year": "2020",
"journal-title": "Future Microbiol"
},
{
"issue": "3",
"key": "pone.0233284.ref045",
"doi-asserted-by": "crossref",
"first-page": "311",
"DOI": "10.1177/0890334408317435",
"article-title": "The bacteriocin nisin, an effective agent for the treatment of staphylococcal mastitis during lactation",
"volume": "24",
"author": "L Fernández",
"year": "2008",
"journal-title": "J Hum Lact"
},
{
"issue": "1",
"key": "pone.0233284.ref046",
"doi-asserted-by": "crossref",
"first-page": "33",
"DOI": "10.1159/000272223",
"article-title": "Inflammatory breast diseases during lactation: milk stasis, puerperal mastitis, abscesses of the breast, and malignant tumorscurrent and evidence-based strategies for diagnosis and therapy",
"volume": "5",
"author": "M Abou-Dakn",
"year": "2010",
"journal-title": "Breast Care"
},
{
"issue": "6",
"key": "pone.0233284.ref047",
"doi-asserted-by": "crossref",
"first-page": "430",
"DOI": "10.1007/s12262-012-0776-1",
"article-title": "Management of lactational mastitis and breast abscesses: review of current knowledge and practice",
"volume": "75",
"author": "K Kataria",
"year": "2013",
"journal-title": "Indian J Surg"
},
{
"issue": "2",
"key": "pone.0233284.ref048",
"doi-asserted-by": "crossref",
"first-page": "77",
"DOI": "10.1016/j.micres.2012.09.004",
"article-title": "Genotypic diversity and virulent factors of Staphylococcus epidermidis isolated from human breast milk",
"volume": "168",
"author": "J Begović",
"year": "2013",
"journal-title": "Microbiol Res"
},
{
"issue": "Pt 8",
"key": "pone.0233284.ref049",
"doi-asserted-by": "crossref",
"first-page": "761",
"DOI": "10.1099/jmm.0.05453-0",
"article-title": "Antimicrobial-resistance and enterotoxin-encoding genes among staphylococci isolated from expressed human breast milk",
"volume": "53",
"author": "LA Carneiro",
"year": "2004",
"journal-title": "J Med Microbiol"
},
{
"issue": "2",
"key": "pone.0233284.ref050",
"doi-asserted-by": "crossref",
"first-page": "113",
"DOI": "10.1007/s00284-015-0925-4",
"article-title": "Antibiotic susceptibility of commensal bacteria from human milk",
"volume": "72",
"author": "PW Chen",
"year": "2016",
"journal-title": "Curr Microbiol"
},
{
"key": "pone.0233284.ref051",
"doi-asserted-by": "crossref",
"first-page": "2512",
"DOI": "10.3389/fmicb.2018.02512",
"article-title": "Microbial community dynamics in mother's milk and infant's mouth and gut in moderately preterm infants",
"volume": "9",
"author": "E Biagi",
"year": "2018",
"journal-title": "Front Microbiol"
},
{
"key": "pone.0233284.ref052",
"doi-asserted-by": "crossref",
"first-page": "4",
"DOI": "10.3410/M4-4",
"article-title": "Reduced vancomycin susceptibility among clinical Staphylococcus aureus isolates ('the MIC Creep'): implications for therapy",
"volume": "4",
"author": "A Dhand",
"year": "2012",
"journal-title": "F1000 Med Rep"
},
{
"issue": "12",
"key": "pone.0233284.ref053",
"doi-asserted-by": "crossref",
"first-page": "1112",
"DOI": "10.1136/jcp.2009.069021",
"article-title": "Low concentrations of vancomycin stimulate biofilm formation in some clinical isolates of Staphylococcus epidermidis",
"volume": "62",
"author": "JS Cargill",
"year": "2009",
"journal-title": "J Clin Pathol"
},
{
"issue": "2",
"key": "pone.0233284.ref054",
"doi-asserted-by": "crossref",
"first-page": "191",
"DOI": "10.1002/jobm.201000221",
"article-title": "Effect of sub-lethal doses of vancomycin and oxacillin on biofilm formation by vancomycin intermediate resistant Staphylococcus aureus",
"volume": "51",
"author": "ZA Mirani",
"year": "2011",
"journal-title": "J Basic Microbiol"
},
{
"key": "pone.0233284.ref055",
"doi-asserted-by": "crossref",
"first-page": "225",
"DOI": "10.1016/j.micpath.2017.07.004",
"article-title": "Vancomycin-induced biofilm formation by methicillin-resistant Staphylococcus aureus is associated with the secretion of membrane vesicles",
"volume": "110",
"author": "X He",
"year": "2017",
"journal-title": "Microb Pathog"
},
{
"issue": "9",
"key": "pone.0233284.ref056",
"doi-asserted-by": "crossref",
"first-page": "1627",
"DOI": "10.4315/0362-028X.JFP-12-001",
"article-title": "Effects of nisin and lysozyme on growth inhibition and biofilm formation capacity of Staphylococcus aureus strains isolated from raw milk and cheese samples",
"volume": "75",
"author": "M Sudagidan",
"year": "2012",
"journal-title": "J Food Prot"
},
{
"issue": "3",
"key": "pone.0233284.ref057",
"doi-asserted-by": "crossref",
"first-page": "253",
"DOI": "10.1016/j.ijfoodmicro.2008.01.011",
"article-title": "Nisin-bacteriophage cross-resistance in Staphylococcus aureus",
"volume": "122",
"author": "B Martinez",
"year": "2008",
"journal-title": "Int J Food Microbiol"
},
{
"issue": "1",
"key": "pone.0233284.ref058",
"doi-asserted-by": "crossref",
"first-page": "82",
"DOI": "10.2146/ajhp080434",
"article-title": "Therapeutic monitoring of vancomycin in adult patients: a consensus review of the american society of health-system pharmacists, the infectious diseases society of america, and the society of infectious diseases pharmacists",
"volume": "66",
"author": "M Rybak",
"year": "2009",
"journal-title": "Am J Health Syst Pharm"
},
{
"issue": "2",
"key": "pone.0233284.ref059",
"doi-asserted-by": "crossref",
"first-page": "277",
"DOI": "10.1111/j.1574-695X.2007.00300.x",
"article-title": "Increased tolerance of Staphylococcus aureus to vancomycin in viscous media",
"volume": "51",
"author": "V Kostenko",
"year": "2007",
"journal-title": "FEMS Immunol Med Microbiol"
},
{
"key": "pone.0233284.ref060",
"first-page": "107",
"article-title": "Multidrug tolerance of biofilms and persister cells",
"volume": "322",
"author": "K. Lewis",
"year": "2008",
"journal-title": "Curr Top Microbiol Immunol"
},
{
"issue": "6",
"key": "pone.0233284.ref061",
"doi-asserted-by": "crossref",
"first-page": "ftw056",
"DOI": "10.1093/femspd/ftw056",
"article-title": "Penetration barrier contributes to bacterial biofilm-associated resistance against only select antibiotics, and exhibits genus-, strain- and antibiotic-specific differences",
"volume": "74",
"author": "R Singh",
"year": "2016",
"journal-title": "Pathog Dis"
},
{
"issue": "12",
"key": "pone.0233284.ref062",
"doi-asserted-by": "crossref",
"first-page": "7273",
"DOI": "10.1128/AAC.03132-14",
"article-title": "Extracellular DNA impedes the transport of vancomycin in Staphylococcus epidermidis biofilms preexposed to subinhibitory concentrations of vancomycin",
"volume": "58",
"author": "N Doroshenko",
"year": "2014",
"journal-title": "Antimicrob Agents Chemotherapy"
},
{
"issue": "1",
"key": "pone.0233284.ref063",
"doi-asserted-by": "crossref",
"first-page": "46",
"DOI": "10.1007/s00776-005-0968-7",
"article-title": "Antimicrobial susceptibility of Staphylococcus aureus and Staphylococcus epidermidis biofilms isolated from infected total hip arthroplasty cases",
"volume": "11",
"author": "S Nishimura",
"year": "2006",
"journal-title": "J Orthop Sci"
}
],
"container-title": [
"PLOS ONE"
],
"original-title": [
],
"language": "en",
"link": [
{
"URL": "https://dx.plos.org/10.1371/journal.pone.0233284",
"content-type": "unspecified",
"content-version": "vor",
"intended-application": "similarity-checking"
}
],
"deposited": {
"date-parts": [
[
2020,
5,
29
]
],
"date-time": "2020-05-29T17:54:37Z",
"timestamp": 1590774877000
},
"score": 1,
"resource": {
"primary": {
"URL": "https://dx.plos.org/10.1371/journal.pone.0233284"
}
},
"subtitle": [
],
"editor": [
{
"given": "Rita G.",
"family": "Sobral",
"sequence": "first",
"affiliation": [
]
}
],
"short-title": [
],
"issued": {
"date-parts": [
[
2020,
5,
29
]
]
},
"references-count": 63,
"journal-issue": {
"issue": "5",
"published-online": {
"date-parts": [
[
2020,
5,
29
]
]
}
},
"URL": "http://dx.doi.org/10.1371/journal.pone.0233284",
"relation": {
},
"ISSN": [
"1932-6203"
],
"issn-type": [
{
"value": "1932-6203",
"type": "electronic"
}
],
"subject": [
"Multidisciplinary"
],
"published": {
"date-parts": [
[
2020,
5,
29
]
]
}
}

View File

@ -475,6 +475,86 @@ class CrossrefMappingTest {
}
@Test
def testConvertArticleFromCrossRef2OafSFI(): Unit = {
val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/sfi_funded_article.json"))
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty)
val items = resultList.filter(p => p.isInstanceOf[Publication])
assert(items.nonEmpty)
assert(items.size == 1)
val result: Result = items.head.asInstanceOf[Publication]
assertNotNull(result)
logger.info(mapper.writeValueAsString(result));
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull(
result.getDataInfo.getProvenanceaction,
"DataInfo/Provenance test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
"DataInfo/Provenance/classId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
"DataInfo/Provenance/className test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
"DataInfo/Provenance/SchemeId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
"DataInfo/Provenance/SchemeName test not null Failed"
);
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty);
val collectedFromList = result.getCollectedfrom.asScala
assert(
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
"Wrong collected from assertion"
)
assert(
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
"Wrong collected from assertion"
)
val relevantDates = result.getRelevantdate.asScala
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
"Missing relevant date of type created"
)
val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
assertFalse(rels.isEmpty)
rels.foreach(relation => {
assertNotNull(relation)
assertFalse(relation.getSource.isEmpty)
assertFalse(relation.getTarget.isEmpty)
assertFalse(relation.getRelClass.isEmpty)
assertFalse(relation.getRelType.isEmpty)
assertFalse(relation.getSubRelType.isEmpty)
})
}
@Test
def testConvertFromCrossRef2OafIssue(): Unit = {
val json = Source

View File

@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.countrypropagation;
+package eu.dnetlib.dhp;
import java.io.Serializable;

View File

@ -23,4 +23,5 @@ public class KeyValueSet implements Serializable {
public void setValueSet(ArrayList<String> valueSet) {
this.valueSet = valueSet;
}
}

View File

@ -4,22 +4,21 @@ package eu.dnetlib.dhp;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
-import eu.dnetlib.dhp.schema.oaf.Country;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.Qualifier;
-import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.schema.oaf.*;
public class PropagationConstant {
@ -221,9 +220,28 @@ public class PropagationConstant {
.orElse(Boolean.FALSE);
}
-public static void createCfHbforResult(SparkSession spark) {
-org.apache.spark.sql.Dataset<Row> cfhb = spark.sql(cfHbforResultQuery);
-cfhb.createOrReplaceTempView("cfhb");
+// of the results collects the distinct keys for collected from (at the level of the result) and hosted by
+// and produces pairs resultId, key for each distinct key associated to the result
+public static <R extends Result> void createCfHbforResult(SparkSession spark, String inputPath, String outputPath,
+Class<R> resultClazz) {
+readPath(spark, inputPath, resultClazz)
+.filter(
+(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
+!r.getDataInfo().getInvisible())
+.flatMap((FlatMapFunction<R, EntityEntityRel>) r -> {
+Set<String> cfhb = r.getCollectedfrom().stream().map(cf -> cf.getKey()).collect(Collectors.toSet());
+cfhb.addAll(r.getInstance().stream().map(i -> i.getHostedby().getKey()).collect(Collectors.toSet()));
+return cfhb
+.stream()
+.map(value -> EntityEntityRel.newInstance(r.getId(), value))
+.collect(Collectors.toList())
+.iterator();
+}, Encoders.bean(EntityEntityRel.class))
+.write()
+.mode(SaveMode.Overwrite)
+.option("compression", "gzip")
+.json(outputPath);
}
public static <R> Dataset<R> readPath(

View File

@ -64,12 +64,6 @@ public class SparkBulkTagJob {
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
-final Boolean saveGraph = Optional
-.ofNullable(parser.get("saveGraph"))
-.map(Boolean::valueOf)
-.orElse(Boolean.TRUE);
-log.info("saveGraph: {}", saveGraph);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
@ -86,10 +80,9 @@ public class SparkBulkTagJob {
runWithSparkSession(
conf,
isSparkSessionManaged,
-spark -> {
-removeOutputDir(spark, outputPath);
-execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
-});
+spark ->
+execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc)
+);
}
private static <R extends Result> void execBulkTag(
@ -113,6 +106,13 @@ public class SparkBulkTagJob {
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
+readPath(spark, outputPath, resultClazz)
+.write()
+.mode(SaveMode.Overwrite)
+.option("compression", "gzip")
+.json(inputPath);
}
public static <R> Dataset<R> readPath(

View File

@ -16,6 +16,7 @@ import javax.print.attribute.DocAttributeSet;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
@ -34,6 +35,7 @@ import eu.dnetlib.dhp.bulktag.community.*;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
+import scala.Tuple2;
/** /**
* @author miriam.baglioni * @author miriam.baglioni
@ -44,6 +46,11 @@ public class SparkEoscBulkTag implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkEoscBulkTag.class);
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+private static String OPENAIRE_3 = "openaire3.0";
+private static String OPENAIRE_4 = "openaire-pub_4.0";
+private static String OPENAIRE_CRIS = "openaire-cris_1.1";
+private static String OPENAIRE_DATA = "openaire2.0_data";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
@ -72,6 +79,9 @@ public class SparkEoscBulkTag implements Serializable {
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
+final String resultType = parser.get("resultType");
+log.info("resultType: {}", resultType);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
@ -82,41 +92,71 @@ public class SparkEoscBulkTag implements Serializable {
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, workingPath);
-execBulkTag(spark, inputPath, workingPath, datasourceMapPath, resultClazz);
+selectCompliantDatasources(spark, inputPath, workingPath, datasourceMapPath);
+execBulkTag(spark, inputPath, workingPath, resultType, resultClazz);
});
}
+private static void selectCompliantDatasources(SparkSession spark, String inputPath, String workingPath,
+String datasourceMapPath) {
+Dataset<Datasource> datasources = readPath(spark, inputPath + "datasource", Datasource.class)
+.filter((FilterFunction<Datasource>) ds -> {
+final String compatibility = ds.getOpenairecompatibility().getClassid();
+return compatibility.equalsIgnoreCase(OPENAIRE_3) ||
+compatibility.equalsIgnoreCase(OPENAIRE_4) ||
+compatibility.equalsIgnoreCase(OPENAIRE_CRIS) ||
+compatibility.equalsIgnoreCase(OPENAIRE_DATA);
+});
+Dataset<DatasourceMaster> datasourceMaster = readPath(spark, datasourceMapPath, DatasourceMaster.class);
+datasources
+.joinWith(datasourceMaster, datasources.col("id").equalTo(datasourceMaster.col("master")), "left")
+.map(
+(MapFunction<Tuple2<Datasource, DatasourceMaster>, DatasourceMaster>) t2 -> t2._2(),
+Encoders.bean(DatasourceMaster.class))
+.filter(Objects::nonNull)
+.write()
+.mode(SaveMode.Overwrite)
+.option("compression", "gzip")
+.json(workingPath + "datasource");
+}
private static <R extends Result> void execBulkTag(
SparkSession spark,
String inputPath,
String workingPath,
-String datasourceMapPath,
+String resultType,
Class<R> resultClazz) {
-List<String> hostedByList = readPath(spark, datasourceMapPath, DatasourceMaster.class)
+List<String> hostedByList = readPath(spark, workingPath + "datasource", DatasourceMaster.class)
.map((MapFunction<DatasourceMaster, String>) dm -> dm.getMaster(), Encoders.STRING())
.collectAsList();
-readPath(spark, inputPath, resultClazz)
-.map(patchResult(), Encoders.bean(resultClazz))
-.filter(Objects::nonNull)
+readPath(spark, inputPath + resultType, resultClazz)
.map(
(MapFunction<R, R>) value -> enrich(value, hostedByList),
Encoders.bean(resultClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
-.json(workingPath);
+.json(workingPath + resultType);
-readPath(spark, workingPath, resultClazz)
+readPath(spark, workingPath + resultType, resultClazz)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
-.json(inputPath);
+.json(inputPath + resultType);
}
private static <R extends Result> R enrich(R value, List<String> hostedByList) {
+if (value.getDataInfo().getDeletedbyinference() == null) {
+value.getDataInfo().setDeletedbyinference(false);
+}
+if (value.getContext() == null) {
+value.setContext(new ArrayList<>());
+}
if (value
.getInstance()
.stream()

View File

@ -8,10 +8,11 @@ import java.util.Arrays;
import java.util.List;
import java.util.Optional;
+import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
+import eu.dnetlib.dhp.countrypropagation.pojo.DatasourceCountry;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
@ -20,8 +21,7 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.EntityEntityRel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Datasource;
@ -57,8 +57,8 @@ public class PrepareDatasourceCountryAssociation {
String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
-final String outputPath = parser.get("outputPath");
-log.info("outputPath {}: ", outputPath);
+final String workingPath = parser.get("workingPath");
+log.info("workingPath {}: ", workingPath);
SparkConf conf = new SparkConf();
@ -66,13 +66,13 @@ public class PrepareDatasourceCountryAssociation {
conf,
isSparkSessionManaged,
spark -> {
-removeOutputDir(spark, outputPath);
+removeOutputDir(spark, workingPath + "/datasourceCountry");
prepareDatasourceCountryAssociation(
spark,
Arrays.asList(parser.get("whitelist").split(";")),
Arrays.asList(parser.get("allowedtypes").split(";")),
inputPath,
-outputPath);
+workingPath + "/datasourceCountry");
});
}

View File

@ -2,20 +2,19 @@
package eu.dnetlib.dhp.countrypropagation;
import static eu.dnetlib.dhp.PropagationConstant.*;
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
-import java.util.stream.Collectors;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
+import eu.dnetlib.dhp.countrypropagation.pojo.DatasourceCountry;
+import eu.dnetlib.dhp.countrypropagation.pojo.ResultCountrySet;
import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.function.FilterFunction;
-import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
@ -23,6 +22,8 @@ import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import eu.dnetlib.dhp.EntityEntityRel;
+import eu.dnetlib.dhp.PropagationConstant;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
@ -45,19 +46,18 @@ public class PrepareResultCountrySet {
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String workingPath = parser.get("workingPath"); String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
String inputPath = parser.get("sourcePath"); String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath); log.info("inputPath: {}", inputPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String datasourcecountrypath = parser.get("preparedInfoPath");
log.info("preparedInfoPath: {}", datasourcecountrypath);
final String resultClassName = parser.get("resultTableName"); final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName); log.info("resultTableName: {}", resultClassName);
final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
log.info("resultType: {}", resultType);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName); Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
@ -66,50 +66,31 @@ public class PrepareResultCountrySet {
conf,
isSparkSessionManaged,
spark -> {
-removeOutputDir(spark, outputPath);
+removeOutputDir(spark, workingPath + "/preparedInfo/" + resultType);
getPotentialResultToUpdate(
spark,
inputPath,
-outputPath,
-datasourcecountrypath,
workingPath,
+resultType,
resultClazz);
});
}
private static <R extends Result> void getPotentialResultToUpdate(
SparkSession spark,
String inputPath,
-String outputPath,
-String datasourcecountrypath,
String workingPath,
+String resultType,
Class<R> resultClazz) {
-// selects all the results non deleted by inference and non invisible
-Dataset<R> result = readPath(spark, inputPath, resultClazz)
-.filter(
-(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
-!r.getDataInfo().getInvisible());
-// of the results collects the distinct keys for collected from (at the level of the result) and hosted by
-// and produces pairs resultId, key for each distinct key associated to the result
-result.flatMap((FlatMapFunction<R, EntityEntityRel>) r -> {
-Set<String> cfhb = r.getCollectedfrom().stream().map(cf -> cf.getKey()).collect(Collectors.toSet());
-cfhb.addAll(r.getInstance().stream().map(i -> i.getHostedby().getKey()).collect(Collectors.toSet()));
-return cfhb
-.stream()
-.map(value -> EntityEntityRel.newInstance(r.getId(), value))
-.collect(Collectors.toList())
-.iterator();
-}, Encoders.bean(EntityEntityRel.class))
-.write()
-.mode(SaveMode.Overwrite)
-.option("compression", "gzip")
-.json(workingPath + "/resultCfHb");
+final String datasourcecountrypath = workingPath + "/datasourceCountry";
+final String cfhbpath = workingPath + "/resultCfHb/" + resultType;
+final String outputPath = workingPath + "/preparedInfo/" + resultType;
+PropagationConstant.createCfHbforResult(spark, inputPath, cfhbpath, resultClazz);
Dataset<DatasourceCountry> datasource_country = readPath(spark, datasourcecountrypath, DatasourceCountry.class);
-Dataset<EntityEntityRel> cfhb = readPath(spark, workingPath + "/resultCfHb", EntityEntityRel.class);
+Dataset<EntityEntityRel> cfhb = readPath(spark, cfhbpath, EntityEntityRel.class);
datasource_country
.joinWith(

View File

@ -9,6 +9,8 @@ import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
+import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
+import eu.dnetlib.dhp.countrypropagation.pojo.ResultCountrySet;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
@ -47,15 +49,17 @@ public class SparkCountryPropagationJob {
String sourcePath = parser.get("sourcePath");
log.info("sourcePath: {}", sourcePath);
-String preparedInfoPath = parser.get("preparedInfoPath");
-log.info("preparedInfoPath: {}", preparedInfoPath);
-final String outputPath = parser.get("outputPath");
-log.info("outputPath: {}", outputPath);
+String workingPath = parser.get("workingPath");
+log.info("workingPath: {}", workingPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
+final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
+log.info("resultType: {}", resultType);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
@ -63,12 +67,12 @@ public class SparkCountryPropagationJob {
conf,
isSparkSessionManaged,
spark -> {
-removeOutputDir(spark, outputPath);
+removeOutputDir(spark, workingPath + "/" + resultType);
execPropagation(
spark,
sourcePath,
-preparedInfoPath,
-outputPath,
+workingPath,
+resultType,
resultClazz);
});
}
@ -76,18 +80,15 @@ public class SparkCountryPropagationJob {
private static <R extends Result> void execPropagation(
SparkSession spark,
String sourcePath,
-String preparedInfoPath,
-String outputPath,
+String workingPath,
+String resultType,
Class<R> resultClazz) {
log.info("Reading Graph table from: {}", sourcePath);
Dataset<R> res = readPath(spark, sourcePath, resultClazz);
-log.info("Reading prepared info: {}", preparedInfoPath);
-Dataset<ResultCountrySet> prepared = spark
-.read()
-.json(preparedInfoPath)
-.as(Encoders.bean(ResultCountrySet.class));
+log.info("Reading prepared info: {}", workingPath + "/preparedInfo/" + resultType);
+Dataset<ResultCountrySet> prepared = readPath(spark, workingPath + "/preparedInfo/" + resultType, ResultCountrySet.class);
res
.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
@ -95,7 +96,13 @@ public class SparkCountryPropagationJob {
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
-.json(outputPath);
+.json(workingPath + "/" + resultType);
+readPath(spark, workingPath + "/" + resultType, resultClazz)
+.write()
+.mode(SaveMode.Overwrite)
+.option("compression", "gzip")
+.json(sourcePath);
} }

View File

@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.countrypropagation;
+package eu.dnetlib.dhp.countrypropagation.pojo;
import java.io.Serializable;

View File

@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.countrypropagation;
+package eu.dnetlib.dhp.countrypropagation.pojo;
import java.io.Serializable;

View File

@ -1,5 +1,7 @@
-package eu.dnetlib.dhp.countrypropagation;
+package eu.dnetlib.dhp.countrypropagation.pojo;
+import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
import java.io.Serializable;
import java.util.ArrayList;

View File

@ -1,7 +1,9 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
+import java.io.Serializable;
-public class AutoritativeAuthor {
+public class AutoritativeAuthor implements Serializable {
private String name;
private String surname;
@ -40,4 +42,13 @@ public class AutoritativeAuthor {
this.orcid = orcid;
}
+public static AutoritativeAuthor newInstance(String name, String surname, String fullname, String orcid) {
+AutoritativeAuthor aa = new AutoritativeAuthor();
+aa.name = name;
+aa.surname = surname;
+aa.fullname = fullname;
+aa.orcid = orcid;
+return aa;
+}
}

View File

@ -0,0 +1,90 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.PropagationConstant.readPath;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import scala.Tuple2;
public class PrepareResultOrcidAssociationStep0 implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep0.class);
public static void main(String[] args) throws Exception {
String jsonConf = IOUtils
.toString(
PrepareResultOrcidAssociationStep0.class
.getResourceAsStream(
"/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult0_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf);
parser.parseArgument(args);
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final List<String> allowedsemrel = Arrays
.stream(parser.get("allowedsemrels").split(";"))
.map(s -> s.toLowerCase())
.collect(Collectors.toList());
log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
selectRelations(
spark, inputPath, outputPath, allowedsemrel);
});
}
private static void selectRelations(SparkSession spark, String inputPath, String outputPath,
List<String> allowedsemrel) {
readPath(spark, inputPath, Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference()
&& allowedsemrel.contains(r.getRelClass().toLowerCase()))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
}

View File

@ -2,26 +2,37 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*;
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
+import java.util.Optional;
+import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.ForeachFunction;
+import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.sources.v2.reader.InputPartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
+import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import scala.Tuple2;
public class PrepareResultOrcidAssociationStep1 {
private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class);
@ -42,83 +53,112 @@ public class PrepareResultOrcidAssociationStep1 {
String inputPath = parser.get("sourcePath"); String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath); log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("workingPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
final String resultClassName = parser.get("resultTableName"); final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName); log.info("resultTableName: {}", resultClassName);
final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); final List<String> allowedsemrel = Arrays
.stream(parser.get("allowedsemrels").split(";"))
.map(s -> s.toLowerCase())
.collect(Collectors.toList());
log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
final List<String> allowedPids = Arrays.asList(parser.get("allowedpids").split(";"));
log.info("allowedPids: {}", new Gson().toJson(allowedPids));
final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
log.info("resultType: {}", resultType); log.info("resultType: {}", resultType);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName); Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
String inputRelationPath = inputPath + "/relation"; runWithSparkSession(
log.info("inputRelationPath: {}", inputRelationPath);
String inputResultPath = inputPath + "/" + resultType;
log.info("inputResultPath: {}", inputResultPath);
String outputResultPath = outputPath + "/" + resultType;
log.info("outputResultPath: {}", outputResultPath);
runWithSparkHiveSession(
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
removeOutputDir(spark, outputPath); // removeOutputDir(spark, outputPath);
prepareInfo( prepareInfo(
spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel); spark, inputPath, outputPath, resultType, resultClazz, allowedsemrel, allowedPids);
}); });
} }
private static <R extends Result> void prepareInfo( private static <R extends Result> void prepareInfo(
SparkSession spark, SparkSession spark,
String inputRelationPath, String inputPath,
String inputResultPath, String outputPath,
String outputResultPath, String resultType,
Class<R> resultClazz, Class<R> resultClazz,
List<String> allowedsemrel) { List<String> allowedsemrel,
List<String> allowedPids) {
Dataset<Relation> relation = readPath(spark, inputRelationPath, Relation.class); final String inputResultPath = inputPath + "/" + resultType;
relation.createOrReplaceTempView("relation");
Dataset<Relation> relation = readPath(spark, outputPath + "/relationSubset", Relation.class);
log.info("Reading Graph table from: {}", inputResultPath); log.info("Reading Graph table from: {}", inputResultPath);
Dataset<R> result = readPath(spark, inputResultPath, resultClazz);
result.createOrReplaceTempView("result");
String query = "SELECT target resultId, author authorList" final String resultOutputPath = outputPath + "/resultSubset/" + resultType;
+ " FROM (SELECT id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author "
+ " FROM ( "
+ " SELECT DISTINCT id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid "
+ " FROM result "
+ " LATERAL VIEW EXPLODE (author) a AS MyT "
+ " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP "
+ " WHERE lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID + "' or "
+ " lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp "
+ " GROUP BY id) r_t "
+ " JOIN ("
+ " SELECT source, target "
+ " FROM relation "
+ " WHERE datainfo.deletedbyinference = false "
+ getConstraintList(" lower(relclass) = '", allowedsemrel)
+ " ) rel_rel "
+ " ON source = id";
log.info("executedQuery: {}", query); readPath(spark, inputResultPath, resultClazz)
spark .filter(
.sql(query) (FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() && !r.getDataInfo().getInvisible())
.as(Encoders.bean(ResultOrcidList.class)) .filter(
(FilterFunction<R>) r -> Optional
.ofNullable(r.getAuthor())
.map(
al -> al
.stream()
.anyMatch(
a -> hasAllowedPid(a, allowedPids)))
.orElse(false)
)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(resultOutputPath);
Dataset<R> result = readPath(spark, resultOutputPath, resultClazz);
// result.foreach((ForeachFunction<R>) r -> System.out.println(new ObjectMapper().writeValueAsString(r)));
result
.joinWith(relation, result.col("id").equalTo(relation.col("source")))
.map((MapFunction<Tuple2<R, Relation>, ResultOrcidList>) t2 -> {
ResultOrcidList rol = new ResultOrcidList();
rol.setResultId(t2._2().getTarget());
List<AutoritativeAuthor> aal = new ArrayList<>();
t2._1().getAuthor().stream().forEach(a -> {
a.getPid().stream().forEach(p -> {
if (allowedPids.contains(p.getQualifier().getClassid().toLowerCase())) {
aal
.add(
AutoritativeAuthor
.newInstance(a.getName(), a.getSurname(), a.getFullname(), p.getValue()));
}
});
});
rol.setAuthorList(aal);
return rol;
}, Encoders.bean(ResultOrcidList.class))
.write() .write()
.option("compression", "gzip") .option("compression", "gzip")
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.json(outputResultPath); .json(outputPath + "/" + resultType);
}
private static boolean hasAllowedPid(Author a, List<String> allowedPids) {
Optional<List<StructuredProperty>> oPid = Optional.ofNullable(a.getPid());
if (!oPid.isPresent()) {
return false;
}
return oPid.get().stream().anyMatch(p -> allowedPids.contains(p.getQualifier().getClassid().toLowerCase()));
} }
} }
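Note (illustrative, not part of this changeset): the mapping above relies on AutoritativeAuthor.newInstance, a factory that this diff does not show. A plausible sketch, inferred only from the arguments passed here and from the getOrcid() accessor used in the next step, is:
// Sketch (assumption): static factory on the existing AutoritativeAuthor bean;
// field names are inferred from the surrounding code.
public static AutoritativeAuthor newInstance(String name, String surname, String fullname, String orcid) {
    AutoritativeAuthor aa = new AutoritativeAuthor();
    aa.setName(name);
    aa.setSurname(surname);
    aa.setFullname(fullname);
    aa.setOrcid(orcid);
    return aa;
}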

View File

@ -10,6 +10,8 @@ import java.util.Set;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -50,7 +52,7 @@ public class PrepareResultOrcidAssociationStep2 {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
removeOutputDir(spark, outputPath); // removeOutputDir(spark, outputPath);
mergeInfo(spark, inputPath, outputPath); mergeInfo(spark, inputPath, outputPath);
}); });
} }
@ -63,33 +65,31 @@ public class PrepareResultOrcidAssociationStep2 {
.union(readPath(spark, inputPath + "/software", ResultOrcidList.class)); .union(readPath(spark, inputPath + "/software", ResultOrcidList.class));
resultOrcidAssoc resultOrcidAssoc
.toJavaRDD() .groupByKey((MapFunction<ResultOrcidList, String>) rol -> rol.getResultId(), Encoders.STRING())
.mapToPair(r -> new Tuple2<>(r.getResultId(), r)) .mapGroups((MapGroupsFunction<String, ResultOrcidList, ResultOrcidList>) (k, it) -> {
.reduceByKey( ResultOrcidList resultOrcidList = it.next();
(a, b) -> { if (it.hasNext()) {
if (a == null) {
return b;
}
if (b == null) {
return a;
}
Set<String> orcid_set = new HashSet<>(); Set<String> orcid_set = new HashSet<>();
a.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid())); resultOrcidList.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid()));
b it
.getAuthorList() .forEachRemaining(
.stream() val -> val
.forEach( .getAuthorList()
aa -> { .stream()
if (!orcid_set.contains(aa.getOrcid())) { .forEach(
a.getAuthorList().add(aa); aa -> {
orcid_set.add(aa.getOrcid()); if (!orcid_set.contains(aa.getOrcid())) {
} resultOrcidList.getAuthorList().add(aa);
}); orcid_set.add(aa.getOrcid());
return a; }
}) }));
.map(Tuple2::_2) }
.map(r -> OBJECT_MAPPER.writeValueAsString(r)) return resultOrcidList;
.saveAsTextFile(outputPath, GzipCodec.class); }, Encoders.bean(ResultOrcidList.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
} }
} }
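Note (illustrative, not part of this changeset): the merge above replaces an RDD reduceByKey with the typed Dataset API (groupByKey followed by mapGroups, one output record per key). A minimal, self-contained sketch of that pattern with a hypothetical Pair bean and a local SparkSession:
import java.io.Serializable;
import java.util.Arrays;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
public class GroupMergeSketch {
    // Hypothetical bean: Encoders.bean needs a public no-arg constructor and getters/setters.
    public static class Pair implements Serializable {
        private String key;
        private int value;
        public Pair() {}
        public Pair(String key, int value) { this.key = key; this.value = value; }
        public String getKey() { return key; }
        public void setKey(String key) { this.key = key; }
        public int getValue() { return value; }
        public void setValue(int value) { this.value = value; }
    }
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate();
        Dataset<Pair> input = spark
            .createDataset(
                Arrays.asList(new Pair("a", 1), new Pair("a", 2), new Pair("b", 3)),
                Encoders.bean(Pair.class));
        // groupByKey + mapGroups: collapse all records sharing a key into a single merged record,
        // exactly the shape of the ResultOrcidList merge above.
        Dataset<Pair> merged = input
            .groupByKey((MapFunction<Pair, String>) Pair::getKey, Encoders.STRING())
            .mapGroups((MapGroupsFunction<String, Pair, Pair>) (k, it) -> {
                Pair acc = it.next();
                it.forEachRemaining(p -> acc.setValue(acc.getValue() + p.getValue()));
                return acc;
            }, Encoders.bean(Pair.class));
        merged.show();
        spark.stop();
    }
}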

View File

@ -2,7 +2,7 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel; package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
@ -56,25 +56,16 @@ public class SparkOrcidToResultFromSemRelJob {
final String resultClassName = parser.get("resultTableName"); final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName); log.info("resultTableName: {}", resultClassName);
final Boolean saveGraph = Optional
.ofNullable(parser.get("saveGraph"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("saveGraph: {}", saveGraph);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName); Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
runWithSparkHiveSession( runWithSparkSession(
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
removeOutputDir(spark, outputPath); removeOutputDir(spark, outputPath);
if (saveGraph) { execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
}
}); });
} }

View File

@ -51,8 +51,7 @@ public class SparkResultToProjectThroughSemRelJob {
final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
final Boolean saveGraph = Boolean.valueOf(parser.get("saveGraph"));
log.info("saveGraph: {}", saveGraph);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
@ -60,11 +59,9 @@ public class SparkResultToProjectThroughSemRelJob {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
if (isTest(parser)) {
removeOutputDir(spark, outputPath);
}
execPropagation( execPropagation(
spark, outputPath, alreadyLinkedPath, potentialUpdatePath, saveGraph); spark, outputPath, alreadyLinkedPath, potentialUpdatePath);
}); });
} }
@ -72,13 +69,12 @@ public class SparkResultToProjectThroughSemRelJob {
SparkSession spark, SparkSession spark,
String outputPath, String outputPath,
String alreadyLinkedPath, String alreadyLinkedPath,
String potentialUpdatePath, String potentialUpdatePath) {
Boolean saveGraph) {
Dataset<ResultProjectSet> toaddrelations = readPath(spark, potentialUpdatePath, ResultProjectSet.class); Dataset<ResultProjectSet> toaddrelations = readPath(spark, potentialUpdatePath, ResultProjectSet.class);
Dataset<ResultProjectSet> alreadyLinked = readPath(spark, alreadyLinkedPath, ResultProjectSet.class); Dataset<ResultProjectSet> alreadyLinked = readPath(spark, alreadyLinkedPath, ResultProjectSet.class);
if (saveGraph) {
toaddrelations toaddrelations
.joinWith( .joinWith(
alreadyLinked, alreadyLinked,
@ -89,7 +85,7 @@ public class SparkResultToProjectThroughSemRelJob {
.mode(SaveMode.Append) .mode(SaveMode.Append)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath); .json(outputPath);
}
} }
private static FlatMapFunction<Tuple2<ResultProjectSet, ResultProjectSet>, Relation> mapRelationRn() { private static FlatMapFunction<Tuple2<ResultProjectSet, ResultProjectSet>, Relation> mapRelationRn() {

View File

@ -56,11 +56,7 @@ public class SparkResultToCommunityFromOrganizationJob {
final String resultClassName = parser.get("resultTableName"); final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName); log.info("resultTableName: {}", resultClassName);
final Boolean saveGraph = Optional
.ofNullable(parser.get("saveGraph"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("saveGraph: {}", saveGraph);
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName); Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
@ -72,10 +68,9 @@ public class SparkResultToCommunityFromOrganizationJob {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
removeOutputDir(spark, outputPath);
if (saveGraph) {
execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath); execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
}
}); });
} }
@ -99,6 +94,12 @@ public class SparkResultToCommunityFromOrganizationJob {
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath); .json(outputPath);
readPath(spark, outputPath, resultClazz)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(inputPath);
} }
private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> resultCommunityFn() { private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> resultCommunityFn() {

View File

@ -70,13 +70,10 @@ public class SparkResultToCommunityThroughSemRelJob {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
if (isTest(parser)) {
removeOutputDir(spark, outputPath);
}
if (saveGraph) {
execPropagation( execPropagation(
spark, inputPath, outputPath, preparedInfoPath, resultClazz); spark, inputPath, outputPath, preparedInfoPath, resultClazz);
}
}); });
} }
@ -100,6 +97,12 @@ public class SparkResultToCommunityThroughSemRelJob {
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath); .json(outputPath);
readPath(spark, outputPath, resultClazz)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(inputPath);
} }
private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> contextUpdaterFn() { private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> contextUpdaterFn() {

View File

@ -23,4 +23,11 @@ public class DatasourceOrganization implements Serializable {
public void setOrganizationId(String organizationId) { public void setOrganizationId(String organizationId) {
this.organizationId = organizationId; this.organizationId = organizationId;
} }
public static DatasourceOrganization newInstance(String datasourceId, String organizationId) {
DatasourceOrganization dso = new DatasourceOrganization();
dso.datasourceId = datasourceId;
dso.organizationId = organizationId;
return dso;
}
} }

View File

@ -2,17 +2,17 @@
package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList; import java.util.*;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
@ -28,6 +28,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
public class PrepareResultInstRepoAssociation { public class PrepareResultInstRepoAssociation {
@ -49,14 +50,11 @@ public class PrepareResultInstRepoAssociation {
Boolean isSparkSessionManaged = isSparkSessionManaged(parser); Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("sourcePath"); final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath); log.info("inputPath: {}", inputPath);
final String datasourceOrganizationPath = parser.get("datasourceOrganizationPath"); final String workingPath = parser.get("workingPath");
log.info("datasourceOrganizationPath {}: ", datasourceOrganizationPath); log.info("workingPath: {}", workingPath);
final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
List<String> blacklist = Optional List<String> blacklist = Optional
.ofNullable(parser.get("blacklist")) .ofNullable(parser.get("blacklist"))
@ -64,82 +62,92 @@ public class PrepareResultInstRepoAssociation {
.orElse(new ArrayList<>()); .orElse(new ArrayList<>());
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
runWithSparkHiveSession( runWithSparkSession(
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
readNeededResources(spark, inputPath); readNeededResources(spark, inputPath, workingPath, blacklist);
removeOutputDir(spark, datasourceOrganizationPath); prepareDatasourceOrganization(spark, workingPath);
prepareDatasourceOrganization(spark, datasourceOrganizationPath, blacklist);
removeOutputDir(spark, alreadyLinkedPath); prepareAlreadyLinkedAssociation(spark, workingPath);
prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
}); });
} }
private static void readNeededResources(SparkSession spark, String inputPath) { private static void readNeededResources(SparkSession spark, String inputPath, String workingPath,
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class); List<String> blacklist) {
datasource.createOrReplaceTempView("datasource"); readPath(spark, inputPath + "/datasource", Datasource.class)
.filter(
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class); (FilterFunction<Datasource>) ds -> !blacklist.contains(ds.getId()) &&
relation.createOrReplaceTempView("relation"); !ds.getDataInfo().getDeletedbyinference() &&
ds.getDatasourcetype().getClassid().equals(INSTITUTIONAL_REPO_TYPE))
Dataset<Organization> organization = readPath(spark, inputPath + "/organization", Organization.class);
organization.createOrReplaceTempView("organization");
}
private static void prepareDatasourceOrganization(
SparkSession spark, String datasourceOrganizationPath, List<String> blacklist) {
final String blacklisted = blacklist
.stream()
.map(s -> " AND id != '" + s + "'")
.collect(Collectors.joining());
String query = "SELECT source datasourceId, target organizationId "
+ "FROM ( SELECT id "
+ "FROM datasource "
+ "WHERE datasourcetype.classid = '"
+ INSTITUTIONAL_REPO_TYPE
+ "' "
+ "AND datainfo.deletedbyinference = false " + blacklisted + " ) d "
+ "JOIN ( SELECT source, target "
+ "FROM relation "
+ "WHERE lower(relclass) = '"
+ ModelConstants.IS_PROVIDED_BY.toLowerCase()
+ "' "
+ "AND datainfo.deletedbyinference = false ) rel "
+ "ON d.id = rel.source ";
spark
.sql(query)
.as(Encoders.bean(DatasourceOrganization.class))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(datasourceOrganizationPath); .json(workingPath + "/datasource");
readPath(spark, inputPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
(r.getRelClass().toLowerCase().equals(ModelConstants.IS_PROVIDED_BY.toLowerCase()) ||
r.getRelClass().toLowerCase().equals(ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase())))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/relation");
}
private static void prepareDatasourceOrganization(
SparkSession spark, String workingPath) {
Dataset<Datasource> datasource = readPath(spark, workingPath + "/datasource", Datasource.class);
Dataset<Relation> relation = readPath(spark, workingPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> r
.getRelClass()
.toLowerCase()
.equals(ModelConstants.IS_PROVIDED_BY.toLowerCase()));
datasource
.joinWith(relation, datasource.col("id").equalTo(relation.col("source")))
.map(
(MapFunction<Tuple2<Datasource, Relation>, DatasourceOrganization>) t2 -> DatasourceOrganization
.newInstance(t2._2().getSource(), t2._2().getTarget()),
Encoders.bean(DatasourceOrganization.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/datasourceOrganization");
} }
private static void prepareAlreadyLinkedAssociation( private static void prepareAlreadyLinkedAssociation(
SparkSession spark, String alreadyLinkedPath) { SparkSession spark, String workingPath) {
String query = "Select source key, collect_set(target) valueSet "
+ "from relation " readPath(spark, workingPath + "/relation", Relation.class)
+ "where datainfo.deletedbyinference = false " .filter(
+ "and lower(relClass) = '" (FilterFunction<Relation>) r -> r
+ ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase() .getRelClass()
+ "' " .toLowerCase()
+ "group by source"; .equals(ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase()))
.groupByKey((MapFunction<Relation, String>) r -> r.getSource(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Relation, KeyValueSet>) (k, it) -> {
Set<String> values = new HashSet<>();
KeyValueSet kvs = new KeyValueSet();
kvs.setKey(k);
values.add(it.next().getTarget());
it.forEachRemaining(r -> values.add(r.getTarget()));
kvs.setValueSet(new ArrayList<>(values));
return kvs;
}, Encoders.bean(KeyValueSet.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/alreadyLinked");
spark
.sql(query)
.as(Encoders.bean(KeyValueSet.class))
// TODO retry to stick with datasets
.toJavaRDD()
.map(r -> OBJECT_MAPPER.writeValueAsString(r))
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
} }
} }
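Note (illustrative, not part of this changeset): the grouping above materializes KeyValueSet records, but the bean itself is not shown in this diff. Based on the setKey/setValueSet calls used here and the List constructed from the target set, it can be assumed to look roughly like:
// Sketch (assumption): shape of the KeyValueSet bean inferred from its usage above.
public class KeyValueSet implements Serializable {
    private String key;
    private List<String> valueSet;
    public String getKey() { return key; }
    public void setKey(String key) { this.key = key; }
    public List<String> getValueSet() { return valueSet; }
    public void setValueSet(List<String> valueSet) { this.valueSet = valueSet; }
}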

View File

@ -11,10 +11,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.*;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -71,12 +68,6 @@ public class SparkResultToOrganizationFromIstRepoJob {
final String resultClassName = parser.get("resultTableName"); final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName); log.info("resultTableName: {}", resultClassName);
final Boolean saveGraph = Optional
.ofNullable(parser.get("saveGraph"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("saveGraph: {}", saveGraph);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName); Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
@ -86,15 +77,15 @@ public class SparkResultToOrganizationFromIstRepoJob {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
if (saveGraph) {
execPropagation( execPropagation(
spark, spark,
datasourceorganization, datasourceorganization,
alreadylinked, alreadylinked,
inputPath, inputPath,
outputPath, outputPath,
resultClazz); resultClazz);
}
}); });
} }
@ -119,9 +110,15 @@ public class SparkResultToOrganizationFromIstRepoJob {
"left_outer") "left_outer")
.flatMap(createRelationFn(), Encoders.bean(Relation.class)) .flatMap(createRelationFn(), Encoders.bean(Relation.class))
.write() .write()
.mode(SaveMode.Append) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath); .json(outputPath);
readPath(spark, outputPath, Relation.class)
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(inputPath.substring(0, inputPath.lastIndexOf("/") + 1) + "relation");
} }
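Note (illustrative, not part of this changeset): the append step above derives the graph-level relation directory from the result-table path. A small worked example of that expression, with a hypothetical path:
// Illustrative only: placeholder path.
String inputPath = "/tmp/graph/publication";
String relationDir = inputPath.substring(0, inputPath.lastIndexOf("/") + 1) + "relation";
// relationDir == "/tmp/graph/relation": the newly created relations are appended
// alongside the graph's existing relation table.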
private static FlatMapFunction<Tuple2<KeyValueSet, KeyValueSet>, Relation> createRelationFn() { private static FlatMapFunction<Tuple2<KeyValueSet, KeyValueSet>, Relation> createRelationFn() {
@ -159,8 +156,16 @@ public class SparkResultToOrganizationFromIstRepoJob {
Dataset<R> result = readPath(spark, inputPath, resultClazz); Dataset<R> result = readPath(spark, inputPath, resultClazz);
result.createOrReplaceTempView("result"); result.createOrReplaceTempView("result");
createCfHbforResult(spark);
Dataset<Row> cfhb = spark
.sql(
"select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb "
+
"from result r " +
"lateral view explode(instance) i as inst " +
"where r.datainfo.deletedbyinference=false");
// createCfHbforResult(spark);
cfhb.createOrReplaceTempView("cfhb");
dsOrg.createOrReplaceTempView("rels"); dsOrg.createOrReplaceTempView("rels");
return spark return spark

View File

@ -98,13 +98,13 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
String leavesPath, String leavesPath,
String childParentPath, String childParentPath,
String resultOrganizationPath, String resultOrganizationPath,
String graphPath, String relationPath,
String workingPath, String workingPath,
String outputPath, String outputPath,
int iterations) { int iterations) {
if (iterations == 1) { if (iterations == 1) {
doPropagateOnce( doPropagateOnce(
spark, leavesPath, childParentPath, resultOrganizationPath, graphPath, spark, leavesPath, childParentPath, resultOrganizationPath, relationPath,
workingPath, outputPath); workingPath, outputPath);
} else { } else {
@ -123,26 +123,26 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
notReachedFirstParent); notReachedFirstParent);
doPropagate( doPropagate(
spark, leavesPath, childParentPath, resultOrganizationPath, graphPath, spark, leavesPath, childParentPath, resultOrganizationPath, relationPath,
workingPath, outputPath, propagationCounter); workingPath, outputPath, propagationCounter);
} }
} }
private static void doPropagateOnce(SparkSession spark, String leavesPath, String childParentPath, private static void doPropagateOnce(SparkSession spark, String leavesPath, String childParentPath,
String resultOrganizationPath, String graphPath, String workingPath, String resultOrganizationPath, String relationPath, String workingPath,
String outputPath) { String outputPath) {
StepActions StepActions
.execStep( .execStep(
spark, graphPath, workingPath + NEW_RELATION_PATH, spark, relationPath, workingPath + NEW_RELATION_PATH,
leavesPath, childParentPath, resultOrganizationPath); leavesPath, childParentPath, resultOrganizationPath);
addNewRelations(spark, workingPath + NEW_RELATION_PATH, outputPath); addNewRelations(spark, workingPath + NEW_RELATION_PATH, outputPath);
} }
private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath, private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath,
String resultOrganizationPath, String graphPath, String workingPath, String outputPath, String resultOrganizationPath, String relationPath, String workingPath, String outputPath,
PropagationCounter propagationCounter) { PropagationCounter propagationCounter) {
int iteration = 0; int iteration = 0;
long leavesCount; long leavesCount;
@ -151,7 +151,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
iteration++; iteration++;
StepActions StepActions
.execStep( .execStep(
spark, graphPath, workingPath + NEW_RELATION_PATH, spark, relationPath, workingPath + NEW_RELATION_PATH,
leavesPath, childParentPath, resultOrganizationPath); leavesPath, childParentPath, resultOrganizationPath);
StepActions StepActions
.prepareForNextStep( .prepareForNextStep(
@ -225,7 +225,6 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
, Encoders.bean(Relation.class)) , Encoders.bean(Relation.class))
.write() .write()
.mode(SaveMode.Append) .mode(SaveMode.Append)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath); .json(outputPath);

View File

@ -27,10 +27,10 @@ import scala.Tuple2;
public class StepActions implements Serializable { public class StepActions implements Serializable {
public static void execStep(SparkSession spark, public static void execStep(SparkSession spark,
String graphPath, String newRelationPath, String relationPath, String newRelationPath,
String leavesPath, String chldParentOrgPath, String resultOrgPath) { String leavesPath, String chldParentOrgPath, String resultOrgPath) {
Dataset<Relation> relationGraph = readPath(spark, graphPath, Relation.class); Dataset<Relation> relationGraph = readPath(spark, relationPath, Relation.class);
// select only the relation source target among those proposed by propagation that are not already existent // select only the relation source target among those proposed by propagation that are not already existent
getNewRels( getNewRels(
newRelationPath, relationGraph, newRelationPath, relationGraph,

View File

@ -29,6 +29,13 @@
"paramLongName": "isSparkSessionManaged", "paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise", "paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false "paramRequired": false
},
{
"paramName": "rt",
"paramLongName": "resultType",
"paramDescription": "the result type",
"paramRequired": true
} }
] ]
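Note (illustrative, not part of this changeset): the resultType parameter added above would be consumed on the job side through the same ArgumentApplicationParser pattern used throughout this changeset; an assumed fragment for orientation only:
// Illustrative only: read the new parameter and use it to address the result table.
final String resultType = parser.get("resultType");
log.info("resultType: {}", resultType);
final String inputResultPath = sourcePath + "/" + resultType; // e.g. <sourcePath>/publication (sourcePath is assumed)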

View File

@ -11,17 +11,12 @@
"paramDescription": "the name of the result table we are currently working on", "paramDescription": "the name of the result table we are currently working on",
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName": "out", "paramName": "wp",
"paramLongName": "outputPath", "paramLongName": "workingPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "p",
"paramLongName": "preparedInfoPath",
"paramDescription": "the path where prepared info have been stored", "paramDescription": "the path where prepared info have been stored",
"paramRequired": false "paramRequired": true
}, },
{ {
"paramName": "ssm", "paramName": "ssm",

View File

@ -6,8 +6,8 @@
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName": "out", "paramName": "wp",
"paramLongName": "outputPath", "paramLongName": "workingPath",
"paramDescription": "the path used to store temporary output files", "paramDescription": "the path used to store temporary output files",
"paramRequired": true "paramRequired": true
}, },

View File

@ -5,12 +5,6 @@
"paramDescription": "the path of the sequencial file to read", "paramDescription": "the path of the sequencial file to read",
"paramRequired": true "paramRequired": true
}, },
{
"paramName":"out",
"paramLongName":"outputPath",
"paramDescription": "the output path",
"paramRequired": true
},
{ {
"paramName":"w", "paramName":"w",
"paramLongName":"workingPath", "paramLongName":"workingPath",
@ -23,12 +17,7 @@
"paramDescription": "the name of the result table we are currently working on", "paramDescription": "the name of the result table we are currently working on",
"paramRequired": true "paramRequired": true
}, },
{
"paramName": "p",
"paramLongName": "preparedInfoPath",
"paramDescription": "the path where prepared info have been stored",
"paramRequired": true
},
{ {
"paramName": "ssm", "paramName": "ssm",
"paramLongName": "isSparkSessionManaged", "paramLongName": "isSparkSessionManaged",

View File

@ -5,12 +5,6 @@
"paramDescription": "the path of the sequencial file to read", "paramDescription": "the path of the sequencial file to read",
"paramRequired": true "paramRequired": true
}, },
{
"paramName":"sg",
"paramLongName":"saveGraph",
"paramDescription": "true if the new version of the graph must be saved",
"paramRequired": false
},
{ {
"paramName":"h", "paramName":"h",
"paramLongName":"hive_metastore_uris", "paramLongName":"hive_metastore_uris",

View File

@ -0,0 +1,26 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName":"as",
"paramLongName":"allowedsemrels",
"paramDescription": "the allowed sematinc relations for propagation",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]
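Note (illustrative, not part of this changeset): this parameter file matches PrepareResultOrcidAssociationStep0 shown earlier in the diff. An assumed invocation using the long parameter names, with placeholder paths and relation names:
// Illustrative only; paths and semantic relations are placeholders.
PrepareResultOrcidAssociationStep0
    .main(
        new String[] {
            "-isSparkSessionManaged", Boolean.FALSE.toString(),
            "-sourcePath", "/tmp/graph/relation",
            "-allowedsemrels", "IsSupplementedBy;IsSupplementTo",
            "-outputPath", "/tmp/working/preparedInfo/relationSubset"
        });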

View File

@ -12,14 +12,14 @@
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName":"h", "paramName":"ap",
"paramLongName":"hive_metastore_uris", "paramLongName":"allowedpids",
"paramDescription": "the hive metastore uris", "paramDescription": "the allowed pid type to be used for propagation",
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName": "out", "paramName": "wp",
"paramLongName": "outputPath", "paramLongName": "workingPath",
"paramDescription": "the path used to store temporary output files", "paramDescription": "the path used to store temporary output files",
"paramRequired": true "paramRequired": true
}, },
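Note (illustrative, not part of this changeset): the new allowedpids parameter replaces the ORCID classids that the removed Hive query referenced through ModelConstants, and it is compared lower-cased in hasAllowedPid. An assumed way to pass it:
// Illustrative only: the values are assumed to be the lower-cased pid classids
// previously referenced through ModelConstants in the removed SQL.
String[] args = {
    "-allowedpids", "orcid;orcid_pending",
    // ... remaining parameters as defined in the file above
};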

View File

@ -1,184 +0,0 @@
<workflow-app name="project_to_result_propagation" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>allowedsemrels</name>
<description>the allowed semantics </description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="reset_outputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_publication"/>
<path start="copy_dataset"/>
<path start="copy_orp"/>
<path start="copy_software"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/publication</arg>
<arg>${nameNode}/${outputPath}/publication</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/dataset</arg>
<arg>${nameNode}/${outputPath}/dataset</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_orp">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_software">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/software</arg>
<arg>${nameNode}/${outputPath}/software</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<join name="wait" to="prepare_project_results_association"/>
<action name="prepare_project_results_association">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareProjectResultsAssociation</name>
<class>eu.dnetlib.dhp.projecttoresult.PrepareProjectResultsAssociation</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--potentialUpdatePath</arg><arg>${workingDir}/preparedInfo/potentialUpdates</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
</spark>
<ok to="apply_propagation"/>
<error to="Kill"/>
</action>
<action name="apply_propagation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ProjectToResultPropagation</name>
<class>eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--potentialUpdatePath</arg><arg>${workingDir}/preparedInfo/potentialUpdates</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -6,21 +6,9 @@
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName":"h", "paramName":"wp",
"paramLongName":"hive_metastore_uris", "paramLongName":"workingPath",
"paramDescription": "the hive metastore uris", "paramDescription": "path where to store/find prepared/ filtered data",
"paramRequired": true
},
{
"paramName":"dop",
"paramLongName":"datasourceOrganizationPath",
"paramDescription": "path where to store/find association from datasource and organization",
"paramRequired": true
},
{
"paramName":"alp",
"paramLongName":"alreadyLinkedPath",
"paramDescription": "path where to store/find already linked results and organizations",
"paramRequired": true "paramRequired": true
}, },
{ {

View File

@ -1,195 +0,0 @@
<workflow-app name="affiliation_from_semrel_propagation" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>outputPath</name>
<description>sets the outputPath</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<decision name="resume_from">
<switch>
<case to="prepare_info">${wf:conf('resumeFrom') eq 'PrepareInfo'}</case>
<default to="reset_outputpath"/> <!-- first action to be done when downloadDump is to be performed -->
</switch>
</decision>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_publication"/>
<path start="copy_dataset"/>
<path start="copy_orp"/>
<path start="copy_software"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/publication</arg>
<arg>${nameNode}/${outputPath}/publication</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/dataset</arg>
<arg>${nameNode}/${outputPath}/dataset</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_orp">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_software">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/software</arg>
<arg>${nameNode}/${outputPath}/software</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<join name="wait" to="prepare_info"/>
<action name="prepare_info">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareResultOrganizationAssociation</name>
<class>eu.dnetlib.dhp.resulttoorganizationfromsemrel.PrepareInfo</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--graphPath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>
<arg>--resultOrgPath</arg><arg>${workingDir}/preparedInfo/resultOrgPath</arg>
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
</spark>
<ok to="apply_resulttoorganization_propagation"/>
<error to="Kill"/>
</action>
<action name="apply_resulttoorganization_propagation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>resultToOrganizationFromSemRel</name>
<class>eu.dnetlib.dhp.resulttoorganizationfromsemrel.SparkResultToOrganizationFromSemRel</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>
<arg>--resultOrgPath</arg><arg>${workingDir}/preparedInfo/resultOrgPath</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--workingDir</arg><arg>${workingDir}/working</arg>
<arg>--iterations</arg><arg>${iterations}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,9 @@
## This is a classpath-based import file (this header is required)
orcid_propagation classpath eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app
bulk_tagging classpath eu/dnetlib/dhp/wf/subworkflows/bulktag/oozie_app
affiliation_inst_repo classpath eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/oozie_app
affiliation_semantic_relation classpath eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfromsemrel/oozie_app
community_organization classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/oozie_app
result_project classpath eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app
community_sem_rel classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app
country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app

View File

@ -0,0 +1,312 @@
<workflow-app name="enrichment_main" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>allowedsemrelsorcidprop</name>
<description>the semantic relationships allowed for propagation</description>
</property>
<property>
<name>allowedsemrelsresultproject</name>
<description>the semantic relationships allowed for the result-to-project propagation</description>
</property>
<property>
<name>allowedsemrelscommunitysemrel</name>
<description>the semantic relationships allowed for propagation</description>
</property>
<property>
<name>datasourceWhitelistForCountryPropagation</name>
<description>the datasource whitelist for country propagation</description>
</property>
<property>
<name>allowedtypes</name>
<description>the allowed types</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
<property>
<name>organizationtoresultcommunitymap</name>
<description>organization community map</description>
</property>
<property>
<name>isLookUpUrl</name>
<description>the isLookup service endpoint</description>
</property>
<property>
<name>pathMap</name>
<description>the json path associated to each selection field</description>
</property>
<property>
<name>blacklist</name>
<description>list of datasources in blacklist for the affiliation from instrepo propagation</description>
</property>
<property>
<name>hiveDbName</name>
<description>the target hive database name</description>
</property>
<property>
<name>hiveJdbcUrl</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hiveMetastoreUris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="resumeFrom"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<decision name="resumeFrom">
<switch>
<case to="bulk_tagging">${wf:conf('resumeFrom') eq 'BulkTagging'}</case>
<case to="affiliation_inst_repo">${wf:conf('resumeFrom') eq 'AffiliationInstitutionalRepository'}</case>
<case to="affiliation_semantic_relation">${wf:conf('resumeFrom') eq 'AffiliationSemanticRelation'}</case>
<case to="community_organization">${wf:conf('resumeFrom') eq 'CommunityOrganization'}</case>
<case to="result_project">${wf:conf('resumeFrom') eq 'ResultProject'}</case>
<case to="community_sem_rel">${wf:conf('resumeFrom') eq 'CommunitySemanticRelation'}</case>
<case to="country_propagation">${wf:conf('resumeFrom') eq 'CountryPropagation'}</case>
<default to="orcid_propagation"/>
</switch>
</decision>
<action name="orcid_propagation">
<sub-workflow>
<app-path>${wf:appPath()}/orcid_propagation
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${sourcePath}</value>
</property>
<property>
<name>allowedsemrels</name>
<value>${allowedsemrelsorcidprop}</value>
</property>
<property>
<name>outputPath</name>
<value>${outputPath}</value>
</property>
</configuration>
</sub-workflow>
<ok to="bulk_tagging" />
<error to="Kill" />
</action>
<action name="bulk_tagging">
<sub-workflow>
<app-path>${wf:appPath()}/bulk_tagging
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>isLookUpUrl</name>
<value>${isLookUpUrl}</value>
</property>
<property>
<name>pathMap</name>
<value>${pathMap}</value>
</property>
</configuration>
</sub-workflow>
<ok to="affiliation_inst_repo" />
<error to="Kill" />
</action>
<action name="affiliation_inst_repo">
<sub-workflow>
<app-path>${wf:appPath()}/affiliation_inst_repo
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>outputPath</name>
<value>${workingDir}/relations</value>
</property>
<property>
<name>blacklist</name>
<value>${blacklist}</value>
</property>
</configuration>
</sub-workflow>
<ok to="affiliation_semantic_relation" />
<error to="Kill" />
</action>
<action name="affiliation_semantic_relation">
<sub-workflow>
<app-path>${wf:appPath()}/affiliation_semantic_relation
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
</configuration>
</sub-workflow>
<ok to="community_organization" />
<error to="Kill" />
</action>
<action name="community_organization">
<sub-workflow>
<app-path>${wf:appPath()}/community_organization
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>organizationtoresultcommunitymap</name>
<value>${organizationtoresultcommunitymap}</value>
</property>
</configuration>
</sub-workflow>
<ok to="result_project" />
<error to="Kill" />
</action>
<action name="result_project">
<sub-workflow>
<app-path>${wf:appPath()}/result_project
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>allowedsemrels</name>
<value>${allowedsemrelsresultproject}</value>
</property>
</configuration>
</sub-workflow>
<ok to="community_sem_rel" />
<error to="Kill" />
</action>
<action name="community_sem_rel">
<sub-workflow>
<app-path>${wf:appPath()}/community_sem_rel
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>allowedsemrels</name>
<value>${allowedsemrelscommunitysemrel}</value>
</property>
</configuration>
</sub-workflow>
<ok to="country_propagation" />
<error to="Kill" />
</action>
<action name="country_propagation">
<sub-workflow>
<app-path>${wf:appPath()}/country_propagation
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>whitelist</name>
<value>${datasourceWhitelistForCountryPropagation}</value>
</property>
<property>
<name>allowedtypes</name>
<value>${allowedtypes}</value>
</property>
</configuration>
</sub-workflow>
<ok to="End" />
<error to="Kill" />
</action>
<end name="End"/>
</workflow-app>
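Because every action above is a <sub-workflow> with <propagate-configuration/>, one job configuration drives the whole chain. A sketch of the properties it references, with placeholder values to adapt to the target cluster (only the property names come from the workflow; pathMap, blacklist and organizationtoresultcommunitymap are sketched separately next to the steps that consume them):

<configuration>
    <property>
        <name>sourcePath</name>
        <value>/tmp/enrichment/graph_input</value>
    </property>
    <property>
        <name>outputPath</name>
        <value>/tmp/enrichment/graph_enriched</value>
    </property>
    <property>
        <name>isLookUpUrl</name>
        <value>http://is-host.example.org:8280/is/services/isLookUp?wsdl</value>
    </property>
    <property>
        <name>allowedsemrelsorcidprop</name>
        <value>isSupplementedBy;isSupplementTo</value>
    </property>
    <property>
        <name>allowedsemrelsresultproject</name>
        <value>isSupplementedBy;isSupplementTo</value>
    </property>
    <property>
        <name>allowedsemrelscommunitysemrel</name>
        <value>isSupplementedBy;isSupplementTo</value>
    </property>
    <property>
        <name>datasourceWhitelistForCountryPropagation</name>
        <value>10|opendoar____::somedatasourceid</value>
    </property>
    <property>
        <name>allowedtypes</name>
        <value>pubsrepository::institutional</value>
    </property>
</configuration>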

View File

@ -44,65 +44,12 @@
</configuration> </configuration>
</global> </global>
<start to="reset_outputpath"/> <start to="fork_exec_bulktag"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="fork_exec_bulktag"/>
<fork name="fork_exec_bulktag"> <fork name="fork_exec_bulktag">
<path start="join_bulktag_publication"/> <path start="join_bulktag_publication"/>
@ -130,7 +77,7 @@
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg> <arg>--outputPath</arg><arg>${workingDir}/bulktag/publication</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg> <arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg> <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark> </spark>
@ -157,7 +104,7 @@
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg> <arg>--outputPath</arg><arg>${workingDir}/bulktag/dataset</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg> <arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg> <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark> </spark>
@ -184,7 +131,7 @@
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg> <arg>--outputPath</arg><arg>${workingDir}/bulktag/otherresearchproduct</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg> <arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg> <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark> </spark>
@ -211,7 +158,7 @@
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg> <arg>--outputPath</arg><arg>${workingDir}/bulktag/software</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg> <arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg> <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark> </spark>
@ -239,7 +186,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}</arg> <arg>--sourcePath</arg><arg>${outputPath}</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscTag</arg> <arg>--workingPath</arg><arg>${workingDir}/bulktag</arg>
</spark> </spark>
<ok to="eosc_get_datasource_master"/> <ok to="eosc_get_datasource_master"/>
<error to="Kill"/> <error to="Kill"/>
@ -283,7 +230,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/publication</arg> <arg>--sourcePath</arg><arg>${outputPath}/publication</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/publication</arg> <arg>--workingPath</arg><arg>${workingDir}/bulktag/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg> <arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark> </spark>
@ -309,7 +256,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/dataset</arg> <arg>--sourcePath</arg><arg>${outputPath}/dataset</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/dataset</arg> <arg>--workingPath</arg><arg>${workingDir}/bulktag/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg> <arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark> </spark>
@ -334,7 +281,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/software</arg> <arg>--sourcePath</arg><arg>${outputPath}/software</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/software</arg> <arg>--workingPath</arg><arg>${workingDir}/bulktag/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg> <arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark> </spark>
@ -359,14 +306,24 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/otherresearchproduct</arg> <arg>--workingPath</arg><arg>${workingDir}/bulktag/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg> <arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark> </spark>
<ok to="wait_eosc_context_tag"/> <ok to="wait_eosc_context_tag"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="wait_eosc_context_tag" to="End"/> <join name="wait_eosc_context_tag" to="reset_workingDir"/>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>
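In the revised bulk-tagging workflow the tagging and EOSC-tagging steps exchange intermediate data under ${workingDir}/bulktag, ${outputPath} holds the tagged graph, and reset_workingDir wipes the scratch area once EOSC context tagging has finished. Assuming workingDir is supplied as a job property, as the references above suggest, a hypothetical fragment keeping the two locations apart would be:

<configuration>
    <property>
        <name>outputPath</name>
        <!-- final tagged graph (placeholder path) -->
        <value>/tmp/enrichment/graph_bulktagged</value>
    </property>
    <property>
        <name>workingDir</name>
        <!-- scratch area, deleted by reset_workingDir at the end of the run -->
        <value>/tmp/enrichment/working/bulktag</value>
    </property>
</configuration>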

View File

@ -12,11 +12,6 @@
<name>allowedtypes</name> <name>allowedtypes</name>
<description>the allowed types</description> <description>the allowed types</description>
</property> </property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
</parameters> </parameters>
<global> <global>
@ -30,65 +25,13 @@
</configuration> </configuration>
</global> </global>
<start to="reset_outputpath"/> <start to="prepare_datasource_country_association"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="prepare_datasource_country_association"/>
<action name="prepare_datasource_country_association"> <action name="prepare_datasource_country_association">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
@ -110,20 +53,20 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--whitelist</arg><arg>${whitelist}</arg> <arg>--whitelist</arg><arg>${whitelist}</arg>
<arg>--allowedtypes</arg><arg>${allowedtypes}</arg> <arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg> <arg>--workingPath</arg><arg>${workingDir}/country</arg>
</spark> </spark>
<ok to="fork_join_prepare_result_country"/> <ok to="fork_prepare_result_country"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<fork name="fork_join_prepare_result_country"> <fork name="fork_prepare_result_country">
<path start="join_prepareresult_publication"/> <path start="prepareresult_publication"/>
<path start="join_prepareresult_dataset"/> <path start="prepareresult_dataset"/>
<path start="join_prepareresult_otherresearchproduct"/> <path start="prepareresult_otherresearchproduct"/>
<path start="join_prepareresult_software"/> <path start="prepareresult_software"/>
</fork> </fork>
<action name="join_prepareresult_publication"> <action name="prepareresult_publication">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
@ -144,16 +87,14 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/publication</arg> <arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingP</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark> </spark>
<ok to="wait_prepare"/> <ok to="wait_prepare"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="join_prepareresult_dataset"> <action name="prepareresult_dataset">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
@ -174,16 +115,14 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg> <arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingD</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark> </spark>
<ok to="wait_prepare"/> <ok to="wait_prepare"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="join_prepareresult_otherresearchproduct"> <action name="prepareresult_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
@ -204,16 +143,14 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg> <arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingO</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark> </spark>
<ok to="wait_prepare"/> <ok to="wait_prepare"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="join_prepareresult_software"> <action name="prepareresult_software">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
@ -234,10 +171,8 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--outputPath</arg><arg>${workingDir}/software</arg> <arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingS</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark> </spark>
<ok to="wait_prepare"/> <ok to="wait_prepare"/>
<error to="Kill"/> <error to="Kill"/>
@ -245,14 +180,14 @@
<join name="wait_prepare" to="fork_join_apply_country_propagation"/> <join name="wait_prepare" to="fork_join_apply_country_propagation"/>
<fork name="fork_join_apply_country_propagation"> <fork name="fork_apply_country_propagation">
<path start="join_propagation_publication"/> <path start="propagation_publication"/>
<path start="join_propagation_dataset"/> <path start="propagation_dataset"/>
<path start="join_propagation_otherresearchproduct"/> <path start="propagation_otherresearchproduct"/>
<path start="join_propagation_software"/> <path start="propagation_software"/>
</fork> </fork>
<action name="join_propagation_publication"> <action name="propagation_publication">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
@ -273,15 +208,15 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/publication</arg> <arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="join_propagation_dataset"> <action name="propagation_dataset">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
@ -302,15 +237,15 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg> <arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="join_propagation_otherresearchproduct"> <action name="propagation_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
@ -331,15 +266,15 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg> <arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="join_propagation_software"> <action name="propagation_software">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
@ -360,16 +295,22 @@
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/software</arg> <arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg> </spark>
</spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="wait" to="End"/> <join name="wait" to="reset_workingDir"/>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>
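The country-propagation workflow no longer declares an outputPath of its own: the prepare and propagation steps exchange data through the single ${workingDir}/country working path. When run standalone it still needs the datasource whitelist and the allowed datasource types; a sketch with placeholder values (the names come from the parameters above, the semicolon-separated format is an assumption):

<configuration>
    <property>
        <name>whitelist</name>
        <!-- datasource identifiers eligible for country propagation, placeholders only -->
        <value>10|opendoar____::somedatasourceid;10|re3data_____::someotherdatasourceid</value>
    </property>
    <property>
        <name>allowedtypes</name>
        <!-- datasource types considered for propagation, illustrative value -->
        <value>pubsrepository::institutional</value>
    </property>
</configuration>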

View File

@ -80,7 +80,37 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="copy_wait" to="fork_prepare_assoc_step1"/> <join name="copy_wait" to="prepare_relations"/>
<action name="prepare_relations">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ORCIDPropagation-PreparePhase0-SelectRELATIONS</name>
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep0</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.sql.shuffle.partitions=3840
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/orcidprop/relationSubset</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
</spark>
<ok to="fork_prepare_assoc_step1"/>
<error to="Kill"/>
</action>
<fork name="fork_prepare_assoc_step1"> <fork name="fork_prepare_assoc_step1">
<path start="join_prepare_publication"/> <path start="join_prepare_publication"/>
@ -112,10 +142,10 @@
--conf spark.hadoop.mapreduce.reduce.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg> <arg>--workingPath</arg><arg>${workingDir}/orcidprop</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg> <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--allowedpids</arg><arg>${allowedpids}</arg>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
@ -140,10 +170,10 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg> <arg>--workingPath</arg><arg>${workingDir}/orcidprop</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg> <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--allowedpids</arg><arg>${allowedpids}</arg>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
@ -168,10 +198,10 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg> <arg>--workingPath</arg><arg>${workingDir}/orcidprop</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg> <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--allowedpids</arg><arg>${allowedpids}</arg>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
@ -196,10 +226,10 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg> <arg>--workingPath</arg><arg>${workingDir}/orcidprop</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg> <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--allowedpids</arg><arg>${allowedpids}</arg>
</spark> </spark>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
@ -225,8 +255,8 @@
--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg> <arg>--sourcePath</arg><arg>${workingDir}/orcidprop</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg> <arg>--outputPath</arg><arg>${workingDir}/orcidprop/mergedOrcidAssoc</arg>
</spark> </spark>
<ok to="fork-join-exec-propagation"/> <ok to="fork-join-exec-propagation"/>
<error to="Kill"/> <error to="Kill"/>
@ -261,9 +291,8 @@
--conf spark.hadoop.mapreduce.reduce.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=3840
</spark-opts> </spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg> <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg> <arg>--outputPath</arg><arg>${outputPath}/publication</arg>
</spark> </spark>
@ -292,9 +321,8 @@
--conf spark.hadoop.mapreduce.map.speculative=false --conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts> </spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg> <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg> <arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
</spark> </spark>
@ -323,9 +351,8 @@
--conf spark.hadoop.mapreduce.map.speculative=false --conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts> </spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg> <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg> <arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
</spark> </spark>
@ -354,9 +381,8 @@
--conf spark.hadoop.mapreduce.map.speculative=false --conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts> </spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg> <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg> <arg>--outputPath</arg><arg>${outputPath}/software</arg>
</spark> </spark>
@ -364,7 +390,16 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="wait2" to="End"/> <join name="wait2" to="reset_workingDir"/>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>

View File

@ -0,0 +1,94 @@
<workflow-app name="project_to_result_propagation" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>allowedsemrels</name>
<description>the allowed semantic relations</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="prepare_project_results_association"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="prepare_project_results_association">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareProjectResultsAssociation</name>
<class>eu.dnetlib.dhp.projecttoresult.PrepareProjectResultsAssociation</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--potentialUpdatePath</arg><arg>${workingDir}/resultproject/preparedInfo/potentialUpdates</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/resultproject/preparedInfo/alreadyLinked</arg>
</spark>
<ok to="apply_propagation"/>
<error to="Kill"/>
</action>
<action name="apply_propagation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ProjectToResultPropagation</name>
<class>eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--outputPath</arg><arg>${sourcePath}/relation</arg>
<arg>--potentialUpdatePath</arg><arg>${workingDir}/resultproject/preparedInfo/potentialUpdates</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/resultproject/preparedInfo/alreadyLinked</arg>
</spark>
<ok to="reset_workingDir"/>
<error to="Kill"/>
</action>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
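The project-to-result workflow writes the propagated relations straight back to ${sourcePath}/relation (its --outputPath points at the source relation folder) and then clears ${workingDir}. Besides the two declared parameters it also references hive_metastore_uris, so a standalone submission would need something along these lines (all values are placeholders):

<configuration>
    <property>
        <name>sourcePath</name>
        <value>/tmp/enrichment/graph</value>
    </property>
    <property>
        <name>allowedsemrels</name>
        <value>isSupplementedBy;isSupplementTo</value>
    </property>
    <property>
        <name>hive_metastore_uris</name>
        <value>thrift://hive-metastore.example.org:9083</value>
    </property>
</configuration>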

View File

@ -8,10 +8,7 @@
<name>organizationtoresultcommunitymap</name> <name>organizationtoresultcommunitymap</name>
<description>organization community map</description> <description>organization community map</description>
</property> </property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
</parameters> </parameters>
<global> <global>
@ -25,66 +22,12 @@
</configuration> </configuration>
</global> </global>
<start to="reset_outputpath"/> <start to="prepare_result_communitylist"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="prepare_result_communitylist"/>
<action name="prepare_result_communitylist"> <action name="prepare_result_communitylist">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -104,7 +47,7 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg> <arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg> <arg>--outputPath</arg><arg>${workingDir}/communityorganization/preparedInfo/resultCommunityList</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--organizationtoresultcommunitymap</arg><arg>${organizationtoresultcommunitymap}</arg> <arg>--organizationtoresultcommunitymap</arg><arg>${organizationtoresultcommunitymap}</arg>
</spark> </spark>
@ -137,12 +80,12 @@
--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/communityorganization/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg> <arg>--outputPath</arg><arg>${workingDir}/communityorganization/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -166,12 +109,12 @@
--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/communityorganization/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg> <arg>--outputPath</arg><arg>${workingDir}/communityorganization/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -195,12 +138,12 @@
--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/communityorganization/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg> <arg>--outputPath</arg><arg>${workingDir}/communityorganization/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -224,19 +167,27 @@
--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/communityorganization/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg> <arg>--outputPath</arg><arg>${workingDir}/communityorganization/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="wait2" to="End"/> <join name="wait2" to="reset_workingDir"/>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>
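Community propagation through organizations is driven by organizationtoresultcommunitymap. Its real content comes from the production configuration; the shape is assumed here to be a JSON object mapping organization identifiers to the list of community identifiers their results should be associated with, roughly like the placeholder below:

<property>
    <name>organizationtoresultcommunitymap</name>
    <!-- placeholder identifiers, illustrating only the expected JSON shape -->
    <value>{"20|openorgs____::someorganizationid": ["community1", "community2"]}</value>
</property>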

View File

@ -18,73 +18,13 @@
</property> </property>
</parameters> </parameters>
<start to="reset_outputpath"/> <start to="fork_prepare_assoc_step1"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="fork_prepare_assoc_step1"/>
<fork name="fork_prepare_assoc_step1"> <fork name="fork_prepare_assoc_step1">
<path start="join_prepare_publication"/> <path start="join_prepare_publication"/>
@ -114,7 +54,7 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetCommunityAssoc</arg> <arg>--outputPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg> <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg> <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark> </spark>
@ -143,7 +83,7 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetCommunityAssoc</arg> <arg>--outputPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg> <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg> <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark> </spark>
@ -172,7 +112,7 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetCommunityAssoc</arg> <arg>--outputPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg> <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg> <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark> </spark>
@ -201,7 +141,7 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetCommunityAssoc</arg> <arg>--outputPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg> <arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg> <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark> </spark>
@ -229,8 +169,8 @@
--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/preparedInfo/targetCommunityAssoc</arg> <arg>--sourcePath</arg><arg>${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg> <arg>--outputPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc</arg>
</spark> </spark>
<ok to="fork-join-exec-propagation"/> <ok to="fork-join-exec-propagation"/>
<error to="Kill"/> <error to="Kill"/>
@ -261,12 +201,12 @@
--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg> <arg>--outputPath</arg><arg>${workingDir}/communitysemrel/publication</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -290,12 +230,12 @@
--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg> <arg>--outputPath</arg><arg>${workingDir}/communitysemrel/dataset</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -319,12 +259,12 @@
--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg> <arg>--outputPath</arg><arg>${workingDir}/communitysemrel/otherresearchproduct</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -348,19 +288,26 @@
--conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg> <arg>--outputPath</arg><arg>${workingDir}/communitysemrel/software</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="wait2" to="End"/> <join name="wait2" to="reset_workingDir"/>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>
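For community propagation through semantic relations the relevant knobs are the relation classes to follow and the IS lookup endpoint from which the community configuration is read; in the single-step chain the former arrives as allowedsemrelscommunitysemrel. A placeholder fragment (hostname and values are assumptions):

<configuration>
    <property>
        <name>allowedsemrels</name>
        <value>isSupplementedBy;isSupplementTo</value>
    </property>
    <property>
        <name>isLookUpUrl</name>
        <value>http://is-host.example.org:8280/is/services/isLookUp?wsdl</value>
    </property>
</configuration>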

View File

@ -21,105 +21,12 @@
</configuration> </configuration>
</global> </global>
<start to="reset_outputpath"/> <start to="prepare_result_organization_association"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_publication"/>
<path start="copy_dataset"/>
<path start="copy_orp"/>
<path start="copy_software"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/publication</arg>
<arg>${nameNode}/${outputPath}/publication</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/dataset</arg>
<arg>${nameNode}/${outputPath}/dataset</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_orp">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_software">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/software</arg>
<arg>${nameNode}/${outputPath}/software</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<join name="wait" to="prepare_result_organization_association"/>
<action name="prepare_result_organization_association"> <action name="prepare_result_organization_association">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
@ -138,9 +45,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--workingPath</arg><arg>${workingDir}/affiliationInstRepo</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<arg>--blacklist</arg><arg>${blacklist}</arg> <arg>--blacklist</arg><arg>${blacklist}</arg>
</spark> </spark>
<ok to="fork_join_apply_resulttoorganization_propagation"/> <ok to="fork_join_apply_resulttoorganization_propagation"/>
@ -174,8 +79,8 @@
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg> <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg> <arg>--datasourceOrganizationPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg> <arg>--alreadyLinkedPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
</spark> </spark>
@ -203,8 +108,8 @@
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg> <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg> <arg>--datasourceOrganizationPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg> <arg>--alreadyLinkedPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
</spark> </spark>
@ -232,8 +137,8 @@
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg> <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg> <arg>--datasourceOrganizationPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg> <arg>--alreadyLinkedPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
</spark> </spark>
@ -261,8 +166,8 @@
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg> <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg> <arg>--datasourceOrganizationPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg> <arg>--alreadyLinkedPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
</spark> </spark>

View File

@ -0,0 +1,97 @@
<workflow-app name="affiliation_from_semrel_propagation" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="prepare_info"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
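<!-- PrepareInfo materialises, from the graph at ${sourcePath}, the leaf organizations, the child-parent hierarchy, the existing result-organization links and the relation subset used by the next action. -->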
<action name="prepare_info">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareResultOrganizationAssociation</name>
<class>eu.dnetlib.dhp.resulttoorganizationfromsemrel.PrepareInfo</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--graphPath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--leavesPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/leavesPath</arg>
<arg>--childParentPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/childParentPath</arg>
<arg>--resultOrgPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/resultOrgPath</arg>
<arg>--relationPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/relation</arg>
</spark>
<ok to="apply_resulttoorganization_propagation"/>
<error to="Kill"/>
</action>
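<!-- SparkResultToOrganizationFromSemRel uses the prepared hierarchy to infer new result-organization affiliations, bounded by ${iterations}, and writes its output to ${sourcePath}. -->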
<action name="apply_resulttoorganization_propagation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>resultToOrganizationFromSemRel</name>
<class>eu.dnetlib.dhp.resulttoorganizationfromsemrel.SparkResultToOrganizationFromSemRel</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--relationPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/relation</arg>
<arg>--outputPath</arg><arg>${sourcePath}</arg>
<arg>--leavesPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/leavesPath</arg>
<arg>--childParentPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/childParentPath</arg>
<arg>--resultOrgPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/resultOrgPath</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--workingDir</arg><arg>${workingDir}/affiliationSemanticRelation/working</arg>
<arg>--iterations</arg><arg>${iterations}</arg>
</spark>
<ok to="reset_workingDir"/>
<error to="Kill"/>
</action>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
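Note: a minimal, hypothetical sketch of the bounded hop-by-hop propagation that the workflow above drives with the --iterations argument; the class and method names below are illustrative only and are not part of dhp-enrichment.

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class AffiliationPropagationSketch {
	// resultOrg: result id -> organizations it is affiliated with (assumed input)
	// childParent: organization id -> its parent organizations (assumed input)
	static Map<String, Set<String>> propagate(Map<String, Set<String>> resultOrg,
		Map<String, Set<String>> childParent, int iterations) {
		Map<String, Set<String>> current = new HashMap<>();
		resultOrg.forEach((r, orgs) -> current.put(r, new HashSet<>(orgs)));
		for (int i = 0; i < iterations; i++) {
			boolean changed = false;
			for (Set<String> orgs : current.values()) {
				Set<String> parents = new HashSet<>();
				for (String org : orgs) {
					parents.addAll(childParent.getOrDefault(org, Collections.emptySet()));
				}
				changed |= orgs.addAll(parents); // climb one level of the hierarchy per iteration
			}
			if (!changed) {
				break; // fixed point reached before the iteration budget is spent
			}
		}
		return current;
	}
}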

View File

@ -27,16 +27,11 @@ import org.slf4j.LoggerFactory;
*/
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.bulktag.eosc.DatasourceMaster;
import eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
//"50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea" has instance hostedby eosc
//"50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1" has instance hostedby eosc
//"50|475c1990cbb2::449f28eefccf9f70c04ad70d61e041c7" has two instance one hostedby eosc
//"50|475c1990cbb2::3894c94123e96df8a21249957cf160cb" has EoscTag
public class EOSCContextTaggingTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@ -78,6 +73,22 @@ public class EOSCContextTaggingTest {
@Test
void EoscContextTagTest() throws Exception {
//"50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea" has instance hostedby eosc (cris)
//"50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1" has instance hostedby eosc (zenodo)
//"50|475c1990cbb2::449f28eefccf9f70c04ad70d61e041c7" has two instance one hostedby eosc (wrong compatibility)
//"50|475c1990cbb2::3894c94123e96df8a21249957cf160cb" has EoscTag
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/datasource/datasource_1").getPath())
.map(
(MapFunction<String, Datasource>) value -> OBJECT_MAPPER.readValue(value, Datasource.class),
Encoders.bean(Datasource.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/datasource");
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/dataset/dataset_10.json").getPath())
@ -94,17 +105,24 @@ public class EOSCContextTaggingTest {
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath",
workingDir.toString() + "/input/dataset",
workingDir.toString() + "/input/",
"-workingPath", workingDir.toString() + "/working/dataset",
"-workingPath", workingDir.toString() + "/working/",
"-datasourceMapPath",
getClass()
.getResource("/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster")
.getPath(),
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset"
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-resultType", "dataset"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
Assertions
.assertEquals(
2, sc
.textFile(workingDir.toString() + "/working/datasource")
.map(item -> OBJECT_MAPPER.readValue(item, DatasourceMaster.class))
.count());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
@ -113,7 +131,7 @@ public class EOSCContextTaggingTest {
Assertions
.assertEquals(
4,
2,
tmp
.filter(
s -> s.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
@ -140,17 +158,17 @@ public class EOSCContextTaggingTest {
Assertions
.assertEquals(
1,
0,
tmp
.filter(
d -> d.getId().equals("50|475c1990cbb2::3894c94123e96df8a21249957cf160cb")
d -> d.getId().equals("50|475c1990cbb2::449f28eefccf9f70c04ad70d61e041c7")
&&
d.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
.count());
Assertions
.assertEquals(
1,
0,
tmp
.filter(
d -> d.getId().equals("50|475c1990cbb2::3894c94123e96df8a21249957cf160cb")
@ -159,4 +177,62 @@ public class EOSCContextTaggingTest {
.count());
}
@Test
void EoscContextTagTestEmptyDatasource() throws Exception {
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/dataset/dataset_10.json").getPath())
.map(
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
Encoders.bean(Dataset.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/dataset");
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/datasource/datasource").getPath())
.map(
(MapFunction<String, Datasource>) value -> OBJECT_MAPPER.readValue(value, Datasource.class),
Encoders.bean(Datasource.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/datasource");
SparkEoscBulkTag
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath",
workingDir.toString() + "/input/",
"-workingPath", workingDir.toString() + "/working/",
"-datasourceMapPath",
getClass()
.getResource("/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster")
.getPath(),
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-resultType", "dataset"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(10, tmp.count());
Assertions
.assertEquals(
0,
tmp
.filter(
s -> s.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
.count());
}
}

View File

@ -0,0 +1,634 @@
package eu.dnetlib.dhp.countrypropagation;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.countrypropagation.pojo.DatasourceCountry;
import eu.dnetlib.dhp.countrypropagation.pojo.ResultCountrySet;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Software;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import scala.Tuple2;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
/**
* @author miriam.baglioni
* @Date 23/11/22
*/
public class CountryPropagationAllStepsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DatasourceCountryPreparationTest.class.getSimpleName());
SparkConf conf = new SparkConf();
conf.setAppName(DatasourceCountryPreparationTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DatasourceCountryPreparationTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
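// Runs the whole country-propagation chain on the test graph: datasource-country preparation,
// per-result country sets for publications and software, and the final propagation job,
// verifying the output of each stage with the helpers below.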
@Test
public void allStepsTest() throws Exception {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph")
.getPath();
PrepareDatasourceCountryAssociation
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"--workingPath", workingDir.toString() + "/country",
"--allowedtypes", "pubsrepository::institutional",
"--whitelist",
"10|openaire____::3795d6478e30e2c9f787d427ff160944;10|opendoar____::16e6a3326dd7d868cbc926602a61e4d0;10|eurocrisdris::fe4903425d9040f680d8610d9079ea14;10|openaire____::5b76240cc27a58c6f7ceef7d8c36660e;10|openaire____::172bbccecf8fca44ab6a6653e84cb92a;10|openaire____::149c6590f8a06b46314eed77bfca693f;10|eurocrisdris::a6026877c1a174d60f81fd71f62df1c1;10|openaire____::4692342f0992d91f9e705c26959f09e0;10|openaire____::8d529dbb05ec0284662b391789e8ae2a;10|openaire____::345c9d171ef3c5d706d08041d506428c;10|opendoar____::1c1d4df596d01da60385f0bb17a4a9e0;10|opendoar____::7a614fd06c325499f1680b9896beedeb;10|opendoar____::1ee3dfcd8a0645a25a35977997223d22;10|opendoar____::d296c101daa88a51f6ca8cfc1ac79b50;10|opendoar____::798ed7d4ee7138d49b8828958048130a;10|openaire____::c9d2209ecc4d45ba7b4ca7597acb88a2;10|eurocrisdris::c49e0fe4b9ba7b7fab717d1f0f0a674d;10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539;10|eurocrisdris::432ca599953ff50cd4eeffe22faf3e48"
});
sc.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/publication")
.getPath()).saveAsTextFile(workingDir.toString() + "/source/publication");
sc
.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/software")
.getPath()).saveAsTextFile(workingDir.toString() + "/source/software");
verifyDatasourceCountry();
PrepareResultCountrySet
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--workingPath", workingDir.toString() + "/country",
"--sourcePath", workingDir.toString() + "/source/publication",
"--resultTableName", Publication.class.getCanonicalName()
});
verifyResultCountrySet();
PrepareResultCountrySet
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--workingPath", workingDir.toString() + "/country",
"--sourcePath", workingDir.toString() + "/source/software",
"--resultTableName", Software.class.getCanonicalName()
});
SparkCountryPropagationJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath",workingDir.toString() + "/source/publication",
"-resultTableName", Publication.class.getCanonicalName(),
"-workingPath", workingDir.toString() +"/country"
});
verifyPropagationPublication();
SparkCountryPropagationJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath",workingDir.toString() + "/source/software",
"-resultTableName", Software.class.getCanonicalName(),
"-workingPath", workingDir.toString() + "/country"
});
verifyPropagationSoftware();
}
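// Helper assertions, one per stage: datasource-country associations, prepared result country sets,
// propagated countries on publications, and propagated countries on software.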
void verifyDatasourceCountry(){
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<DatasourceCountry> tmp = sc
.textFile(workingDir.toString() + "/country/datasourceCountry")
.map(item -> OBJECT_MAPPER.readValue(item, DatasourceCountry.class));
Assertions.assertEquals(3, tmp.count());
Assertions
.assertEquals(
1, tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::fe4903425d9040f680d8610d9079ea14"))
.count());
Assertions
.assertEquals(
1, tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|opendoar____::f0dd4a99fba6075a9494772b58f95280"))
.count());
Assertions
.assertEquals(
1, tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539"))
.count());
Assertions
.assertEquals(
"NL", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::fe4903425d9040f680d8610d9079ea14"))
.collect()
.get(0)
.getCountry()
.getClassid());
Assertions
.assertEquals(
"Netherlands", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::fe4903425d9040f680d8610d9079ea14"))
.collect()
.get(0)
.getCountry()
.getClassname());
Assertions
.assertEquals(
"IT", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|opendoar____::f0dd4a99fba6075a9494772b58f95280"))
.collect()
.get(0)
.getCountry()
.getClassid());
Assertions
.assertEquals(
"Italy", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|opendoar____::f0dd4a99fba6075a9494772b58f95280"))
.collect()
.get(0)
.getCountry()
.getClassname());
Assertions
.assertEquals(
"FR", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539"))
.collect()
.get(0)
.getCountry()
.getClassid());
Assertions
.assertEquals(
"France", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539"))
.collect()
.get(0)
.getCountry()
.getClassname());
tmp.foreach(e -> System.out.println(OBJECT_MAPPER.writeValueAsString(e)));
}
void verifyResultCountrySet(){
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultCountrySet> tmp = sc
.textFile(workingDir.toString() + "/country/preparedInfo/publication")
.map(item -> OBJECT_MAPPER.readValue(item, ResultCountrySet.class));
Assertions.assertEquals(5, tmp.count());
ResultCountrySet rc = tmp
.filter(r -> r.getResultId().equals("50|06cdd3ff4700::49ec404cee4e1452808aabeaffbd3072"))
.collect()
.get(0);
Assertions.assertEquals(1, rc.getCountrySet().size());
Assertions.assertEquals("NL", rc.getCountrySet().get(0).getClassid());
Assertions.assertEquals("Netherlands", rc.getCountrySet().get(0).getClassname());
rc = tmp
.filter(r -> r.getResultId().equals("50|07b5c0ccd4fe::e7f5459cc97865f2af6e3da964c1250b"))
.collect()
.get(0);
Assertions.assertEquals(1, rc.getCountrySet().size());
Assertions.assertEquals("NL", rc.getCountrySet().get(0).getClassid());
Assertions.assertEquals("Netherlands", rc.getCountrySet().get(0).getClassname());
rc = tmp
.filter(r -> r.getResultId().equals("50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6"))
.collect()
.get(0);
Assertions.assertEquals(2, rc.getCountrySet().size());
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("FR") && cs.getClassname().equals("France")));
rc = tmp
.filter(r -> r.getResultId().equals("50|355e65625b88::74009c567c81b4aa55c813db658734df"))
.collect()
.get(0);
Assertions.assertEquals(2, rc.getCountrySet().size());
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("NL") && cs.getClassname().equals("Netherlands")));
rc = tmp
.filter(r -> r.getResultId().equals("50|355e65625b88::54a1c76f520bb2c8da27d12e42891088"))
.collect()
.get(0);
Assertions.assertEquals(2, rc.getCountrySet().size());
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("FR") && cs.getClassname().equals("France")));
}
void verifyPropagationPublication(){
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.textFile(workingDir.toString() + "/country/publication")
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
Assertions.assertEquals(12, tmp.count());
Assertions.assertEquals(5, tmp.filter(r -> r.getCountry().size() > 0).count());
tmp
.foreach(
r -> r.getCountry().stream().forEach(c -> Assertions.assertEquals("dnet:countries", c.getSchemeid())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(c -> Assertions.assertEquals("dnet:countries", c.getSchemename())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(c -> Assertions.assertFalse(c.getDataInfo().getDeletedbyinference())));
tmp.foreach(r -> r.getCountry().stream().forEach(c -> Assertions.assertFalse(c.getDataInfo().getInvisible())));
tmp.foreach(r -> r.getCountry().stream().forEach(c -> Assertions.assertTrue(c.getDataInfo().getInferred())));
tmp
.foreach(
r -> r.getCountry().stream().forEach(c -> Assertions.assertEquals("0.85", c.getDataInfo().getTrust())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(c -> Assertions.assertEquals("propagation", c.getDataInfo().getInferenceprovenance())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(
c -> Assertions
.assertEquals("country:instrepos", c.getDataInfo().getProvenanceaction().getClassid())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(
c -> Assertions
.assertEquals(
"dnet:provenanceActions", c.getDataInfo().getProvenanceaction().getSchemeid())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(
c -> Assertions
.assertEquals(
"dnet:provenanceActions", c.getDataInfo().getProvenanceaction().getSchemename())));
List<Country> countries = tmp
.filter(r -> r.getId().equals("50|06cdd3ff4700::49ec404cee4e1452808aabeaffbd3072"))
.collect()
.get(0)
.getCountry();
Assertions.assertEquals(1, countries.size());
Assertions.assertEquals("NL", countries.get(0).getClassid());
Assertions.assertEquals("Netherlands", countries.get(0).getClassname());
countries = tmp
.filter(r -> r.getId().equals("50|07b5c0ccd4fe::e7f5459cc97865f2af6e3da964c1250b"))
.collect()
.get(0)
.getCountry();
Assertions.assertEquals(1, countries.size());
Assertions.assertEquals("NL", countries.get(0).getClassid());
Assertions.assertEquals("Netherlands", countries.get(0).getClassname());
countries = tmp
.filter(r -> r.getId().equals("50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6"))
.collect()
.get(0)
.getCountry();
Assertions.assertEquals(2, countries.size());
Assertions
.assertTrue(
countries.stream().anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
countries.stream().anyMatch(cs -> cs.getClassid().equals("FR") && cs.getClassname().equals("France")));
countries = tmp
.filter(r -> r.getId().equals("50|355e65625b88::74009c567c81b4aa55c813db658734df"))
.collect()
.get(0)
.getCountry();
Assertions.assertEquals(2, countries.size());
Assertions
.assertTrue(
countries.stream().anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
countries
.stream()
.anyMatch(cs -> cs.getClassid().equals("NL") && cs.getClassname().equals("Netherlands")));
countries = tmp
.filter(r -> r.getId().equals("50|355e65625b88::54a1c76f520bb2c8da27d12e42891088"))
.collect()
.get(0)
.getCountry();
Assertions.assertEquals(2, countries.size());
Assertions
.assertTrue(
countries.stream().anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
countries.stream().anyMatch(cs -> cs.getClassid().equals("FR") && cs.getClassname().equals("France")));
}
void verifyPropagationSoftware(){
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Software> tmp = sc
.textFile(workingDir.toString() + "/source/software")
.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
Assertions.assertEquals(10, tmp.count());
Dataset<Software> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Software.class));
Assertions.assertEquals(6, verificationDs.filter("size(country) > 0").count());
Assertions.assertEquals(3, verificationDs.filter("size(country) = 1").count());
Assertions.assertEquals(3, verificationDs.filter("size(country) = 2").count());
Assertions.assertEquals(0, verificationDs.filter("size(country) > 2").count());
Dataset<String> countryExploded = verificationDs
.flatMap(
(FlatMapFunction<Software, Country>) row -> row.getCountry().iterator(), Encoders.bean(Country.class))
.map((MapFunction<Country, String>) Qualifier::getClassid, Encoders.STRING());
Assertions.assertEquals(9, countryExploded.count());
Assertions.assertEquals(1, countryExploded.filter("value = 'FR'").count());
Assertions.assertEquals(1, countryExploded.filter("value = 'TR'").count());
Assertions.assertEquals(2, countryExploded.filter("value = 'IT'").count());
Assertions.assertEquals(1, countryExploded.filter("value = 'US'").count());
Assertions.assertEquals(1, countryExploded.filter("value = 'MX'").count());
Assertions.assertEquals(1, countryExploded.filter("value = 'CH'").count());
Assertions.assertEquals(2, countryExploded.filter("value = 'JP'").count());
Dataset<Tuple2<String, String>> countryExplodedWithCountryclassid = verificationDs
.flatMap((FlatMapFunction<Software, Tuple2<String, String>>) row -> {
List<Tuple2<String, String>> prova = new ArrayList<>();
List<Country> countryList = row.getCountry();
countryList
.forEach(
c -> prova
.add(
new Tuple2<>(
row.getId(), c.getClassid())));
return prova.iterator();
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
Assertions.assertEquals(9, countryExplodedWithCountryclassid.count());
//countryExplodedWithCountryclassid.show(false);
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'FR' ")
.count());
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'TR' ")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'IT' or _2 = 'MX') ")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'IT' or _2 = 'US') ")
.count());
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'JP'")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'CH' or _2 = 'JP') ")
.count());
Dataset<Tuple2<String, String>> countryExplodedWithCountryclassname = verificationDs
.flatMap(
(FlatMapFunction<Software, Tuple2<String, String>>) row -> {
List<Tuple2<String, String>> prova = new ArrayList<>();
List<Country> countryList = row.getCountry();
countryList
.forEach(
c -> prova
.add(
new Tuple2<>(
row.getId(),
c.getClassname())));
return prova.iterator();
},
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
//countryExplodedWithCountryclassname.show(false);
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'France' ")
.count());
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'Turkey' ")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'Italy' or _2 = 'Mexico') ")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'Italy' or _2 = 'United States') ")
.count());
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'Japan' ")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'Switzerland' or _2 = 'Japan') ")
.count());
Dataset<Tuple2<String, String>> countryExplodedWithCountryProvenance = verificationDs
.flatMap(
(FlatMapFunction<Software, Tuple2<String, String>>) row -> {
List<Tuple2<String, String>> prova = new ArrayList<>();
List<Country> countryList = row.getCountry();
countryList
.forEach(
c -> prova
.add(
new Tuple2<>(
row.getId(),
c
.getDataInfo()
.getInferenceprovenance())));
return prova.iterator();
},
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
Assertions
.assertEquals(
7, countryExplodedWithCountryProvenance.filter("_2 = 'propagation'").count());
}
}

View File

@ -7,6 +7,7 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.antlr.v4.runtime.misc.Utils;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -69,26 +70,35 @@ public class CountryPropagationJobTest {
@Test
void testCountryPropagationSoftware() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/software")
.getPath();
final String preparedInfoPath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo/software")
.getPath();
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
sc
.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/software")
.getPath()).saveAsTextFile(workingDir.toString() + "/source/software");
sc
.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo/software")
.getPath()).saveAsTextFile(workingDir.toString() + "/preparedInfo/software");
SparkCountryPropagationJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"--sourcePath",workingDir.toString() + "/source/software",
"-resultTableName", Software.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/software",
"-workingPath", workingDir.toString(),
"-preparedInfoPath", preparedInfoPath
"-resultType", "software"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Software> tmp = sc
.textFile(workingDir.toString() + "/software")
.textFile(workingDir.toString() + "/source/software")
.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
Assertions.assertEquals(10, tmp.count());
@ -130,7 +140,7 @@ public class CountryPropagationJobTest {
Assertions.assertEquals(9, countryExplodedWithCountryclassid.count());
countryExplodedWithCountryclassid.show(false);
//countryExplodedWithCountryclassid.show(false);
Assertions
.assertEquals(
1,
@ -190,7 +200,7 @@ public class CountryPropagationJobTest {
},
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
countryExplodedWithCountryclassname.show(false);
//countryExplodedWithCountryclassname.show(false);
Assertions
.assertEquals(
1,
@ -259,23 +269,31 @@ public class CountryPropagationJobTest {
@Test
void testCountryPropagationPublication() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/publication")
.getPath();
final String preparedInfoPath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo/publication")
.getPath();
SparkCountryPropagationJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"-resultTableName", Publication.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/publication",
"-preparedInfoPath", preparedInfoPath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
sc
.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/publication")
.getPath()).saveAsTextFile(workingDir.toString() + "/source/publication");
sc
.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo/publication")
.getPath()).saveAsTextFile(workingDir.toString() + "/preparedInfo/publication");
SparkCountryPropagationJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath",workingDir.toString() + "/source/publication",
"-resultTableName", Publication.class.getCanonicalName(),
"-workingPath", workingDir.toString(),
"-resultType", "publication"
});
JavaRDD<Publication> tmp = sc
.textFile(workingDir.toString() + "/publication")

View File

@ -5,6 +5,7 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import eu.dnetlib.dhp.countrypropagation.pojo.DatasourceCountry;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -63,7 +64,7 @@ public class DatasourceCountryPreparationTest {
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"--outputPath", workingDir.toString() + "/datasourceCountry",
"--workingPath", workingDir.toString() + "/country",
"--allowedtypes", "pubsrepository::institutional",
"--whitelist",
"10|openaire____::3795d6478e30e2c9f787d427ff160944;10|opendoar____::16e6a3326dd7d868cbc926602a61e4d0;10|eurocrisdris::fe4903425d9040f680d8610d9079ea14;10|openaire____::5b76240cc27a58c6f7ceef7d8c36660e;10|openaire____::172bbccecf8fca44ab6a6653e84cb92a;10|openaire____::149c6590f8a06b46314eed77bfca693f;10|eurocrisdris::a6026877c1a174d60f81fd71f62df1c1;10|openaire____::4692342f0992d91f9e705c26959f09e0;10|openaire____::8d529dbb05ec0284662b391789e8ae2a;10|openaire____::345c9d171ef3c5d706d08041d506428c;10|opendoar____::1c1d4df596d01da60385f0bb17a4a9e0;10|opendoar____::7a614fd06c325499f1680b9896beedeb;10|opendoar____::1ee3dfcd8a0645a25a35977997223d22;10|opendoar____::d296c101daa88a51f6ca8cfc1ac79b50;10|opendoar____::798ed7d4ee7138d49b8828958048130a;10|openaire____::c9d2209ecc4d45ba7b4ca7597acb88a2;10|eurocrisdris::c49e0fe4b9ba7b7fab717d1f0f0a674d;10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539;10|eurocrisdris::432ca599953ff50cd4eeffe22faf3e48"
@ -72,7 +73,7 @@ public class DatasourceCountryPreparationTest {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<DatasourceCountry> tmp = sc
.textFile(workingDir.toString() + "/datasourceCountry")
.textFile(workingDir.toString() + "/country/datasourceCountry")
.map(item -> OBJECT_MAPPER.readValue(item, DatasourceCountry.class));
Assertions.assertEquals(3, tmp.count());

View File

@ -1,12 +1,11 @@
package eu.dnetlib.dhp.countrypropagation;
import static eu.dnetlib.dhp.PropagationConstant.isSparkSessionManaged;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import eu.dnetlib.dhp.countrypropagation.pojo.ResultCountrySet;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -61,25 +60,25 @@ public class ResultCountryPreparationTest {
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/publication") .getResource("/eu/dnetlib/dhp/countrypropagation/graph/publication")
.getPath(); .getPath();
final String preparedInfoPath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/datasourcecountry")
.getPath();
PrepareResultCountrySet
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--workingPath", workingDir.toString() + "/working",
"--sourcePath", sourcePath,
"--outputPath", workingDir.toString() + "/resultCountry",
"--preparedInfoPath", preparedInfoPath,
"--resultTableName", Publication.class.getCanonicalName()
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
sc.textFile(getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/datasourcecountry")
.getPath()).saveAsTextFile(workingDir+"/country/datasourceCountry"); ;
PrepareResultCountrySet
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--workingPath", workingDir.toString() + "/country",
"--sourcePath", sourcePath,
"--resultTableName", Publication.class.getCanonicalName()
});
JavaRDD<ResultCountrySet> tmp = sc
.textFile(workingDir.toString() + "/resultCountry")
.textFile(workingDir.toString() + "/country/preparedInfo/publication")
.map(item -> OBJECT_MAPPER.readValue(item, ResultCountrySet.class));
Assertions.assertEquals(5, tmp.count());

View File

@ -80,7 +80,6 @@ public class OrcidPropagationJobTest {
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath, "-sourcePath", sourcePath,
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-saveGraph", "true",
"-resultTableName", Dataset.class.getCanonicalName(), "-resultTableName", Dataset.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/dataset", "-outputPath", workingDir.toString() + "/dataset",
"-possibleUpdatesPath", possibleUpdatesPath "-possibleUpdatesPath", possibleUpdatesPath
@ -125,8 +124,6 @@ public class OrcidPropagationJobTest {
.getPath(),
"-hive_metastore_uris",
"",
"-saveGraph",
"true",
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath",
@ -193,8 +190,6 @@ public class OrcidPropagationJobTest {
.getPath(),
"-hive_metastore_uris",
"",
"-saveGraph",
"true",
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath",

View File

@ -0,0 +1,207 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.neethi.Assertion;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class PrepareStep1Test {
private static final Logger log = LoggerFactory.getLogger(PrepareStep1Test.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(PrepareStep1Test.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(PrepareStep1Test.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
spark = SparkSession
.builder()
.appName(PrepareStep1Test.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
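// noMatchTest: preparing Dataset associations yields no ORCID links (only the relation subset is written);
// matchTest: preparing Publication associations yields one result linked to a single ORCID author.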
@Test
void noMatchTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparestep1")
.getPath();
PrepareResultOrcidAssociationStep1
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-resultTableName", Dataset.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/preparedInfo",
"-allowedsemrels", "IsSupplementedBy;IsSupplementTo",
"-allowedpids", "orcid;orcid_pending"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultOrcidList> tmp = sc
.textFile(workingDir.toString() + "/preparedInfo/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, ResultOrcidList.class));
Assertions.assertEquals(0, tmp.count());
Assertions
.assertEquals(
7, sc
.textFile(workingDir.toString() + "/preparedInfo/relationSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class))
.count());
Assertions
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/preparedInfo/resultSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
.count());
}
@Test
void matchTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparestep1")
.getPath();
PrepareResultOrcidAssociationStep1
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-resultTableName", Publication.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/preparedInfo",
"-allowedsemrels", "IsSupplementedBy;IsSupplementTo",
"-allowedpids", "orcid;orcid_pending"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultOrcidList> tmp = sc
.textFile(workingDir.toString() + "/preparedInfo/publication")
.map(item -> OBJECT_MAPPER.readValue(item, ResultOrcidList.class));
Assertions.assertEquals(1, tmp.count());
tmp.foreach(e -> System.out.println(OBJECT_MAPPER.writeValueAsString(e)));
Assertions
.assertEquals(
1, tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.count());
Assertions
.assertEquals(
1, tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.size());
Assertions
.assertEquals(
"0000-0002-5001-6911",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getOrcid());
Assertions
.assertEquals(
"Barbarić-Mikočević, Željka",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getFullname());
Assertions
.assertEquals(
"Željka",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getName());
Assertions
.assertEquals(
"Barbarić-Mikočević",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getSurname());
Assertions
.assertEquals(
7, sc
.textFile(workingDir.toString() + "/preparedInfo/relationSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class))
.count());
Assertions
.assertEquals(
1, sc
.textFile(workingDir.toString() + "/preparedInfo/resultSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class))
.count());
}
}

View File

@ -0,0 +1,222 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class PrepareStep2Test {
private static final Logger log = LoggerFactory.getLogger(PrepareStep2Test.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(PrepareStep2Test.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(PrepareStep2Test.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
spark = SparkSession
.builder()
.appName(PrepareStep2Test.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
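// testMatch merges the per-result-type associations produced in step 1: the same result ends up
// with two distinct ORCID iDs attached to its author list.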
@Test
void testMatch() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/resultSubset")
.getPath();
PrepareResultOrcidAssociationStep2
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/preparedInfo/mergedOrcidAssoc"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultOrcidList> tmp = sc
.textFile(workingDir.toString() + "/preparedInfo/mergedOrcidAssoc")
.map(item -> OBJECT_MAPPER.readValue(item, ResultOrcidList.class));
Assertions.assertEquals(1, tmp.count());
Assertions
.assertEquals(
1,
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.count());
Assertions
.assertEquals(
2, tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.size());
Assertions
.assertTrue(
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.stream()
.anyMatch(aa -> aa.getOrcid().equals("0000-0002-1234-5678")));
Assertions
.assertTrue(
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.stream()
.anyMatch(aa -> aa.getOrcid().equals("0000-0002-5001-6911")));
}
@Test
void matchTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparestep1")
.getPath();
PrepareResultOrcidAssociationStep1
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-resultTableName", Publication.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/preparedInfo",
"-allowedsemrels", "IsSupplementedBy;IsSupplementTo",
"-allowedpids", "orcid;orcid_pending"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultOrcidList> tmp = sc
.textFile(workingDir.toString() + "/preparedInfo/publication")
.map(item -> OBJECT_MAPPER.readValue(item, ResultOrcidList.class));
Assertions.assertEquals(1, tmp.count());
tmp.foreach(e -> System.out.println(OBJECT_MAPPER.writeValueAsString(e)));
Assertions
.assertEquals(
1, tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.count());
Assertions
.assertEquals(
1, tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.size());
Assertions
.assertEquals(
"0000-0002-5001-6911",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getOrcid());
Assertions
.assertEquals(
"Barbarić-Mikočević, Željka",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getFullname());
Assertions
.assertEquals(
"Željka",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getName());
Assertions
.assertEquals(
"Barbarić-Mikočević",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getSurname());
Assertions
.assertEquals(
7, sc
.textFile(workingDir.toString() + "/preparedInfo/relationSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class))
.count());
Assertions
.assertEquals(
1, sc
.textFile(workingDir.toString() + "/preparedInfo/resultSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class))
.count());
}
}

View File

@ -33,32 +33,32 @@ public class ProjectPropagationJobTest {
private static SparkSession spark;
private static Path workingDir;
private static final SparkConf conf = new SparkConf();
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(ProjectPropagationJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(ProjectPropagationJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@ -71,6 +71,7 @@ public class ProjectPropagationJobTest {
@Test
void NoUpdateTest() throws Exception {
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
final String potentialUpdateDate = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates")
@ -82,10 +83,10 @@ public class ProjectPropagationJobTest {
SparkResultToProjectThroughSemRelJob
.main(
new String[] {
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-hive_metastore_uris", "",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation",
"-potentialUpdatePath", potentialUpdateDate,
"-alreadyLinkedPath", alreadyLinkedPath,
@ -98,6 +99,10 @@ public class ProjectPropagationJobTest {
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
Assertions.assertEquals(0, tmp.count());
FileUtils.deleteDirectory(workingDir.toFile());
}
/**
@ -107,6 +112,12 @@ public class ProjectPropagationJobTest {
*/ */
@Test @Test
void UpdateTenTest() throws Exception { void UpdateTenTest() throws Exception {
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
spark = SparkSession
.builder()
.appName(ProjectPropagationJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
final String potentialUpdatePath = getClass() final String potentialUpdatePath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates") "/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates")
@ -118,10 +129,10 @@ public class ProjectPropagationJobTest {
SparkResultToProjectThroughSemRelJob SparkResultToProjectThroughSemRelJob
.main( .main(
new String[] { new String[] {
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation", "-outputPath", workingDir.toString() + "/relation",
"-potentialUpdatePath", potentialUpdatePath, "-potentialUpdatePath", potentialUpdatePath,
"-alreadyLinkedPath", alreadyLinkedPath, "-alreadyLinkedPath", alreadyLinkedPath,
@ -169,6 +180,9 @@ public class ProjectPropagationJobTest {
.sql( .sql(
"Select * from temporary where datainfo.inferenceprovenance = 'propagation'") "Select * from temporary where datainfo.inferenceprovenance = 'propagation'")
.count()); .count());
FileUtils.deleteDirectory(workingDir.toFile());
} }
/** /**
@ -179,6 +193,12 @@ public class ProjectPropagationJobTest {
*/ */
@Test @Test
void UpdateMixTest() throws Exception { void UpdateMixTest() throws Exception {
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
spark = SparkSession
.builder()
.appName(ProjectPropagationJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
final String potentialUpdatepath = getClass() final String potentialUpdatepath = getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates") "/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates")
@ -190,10 +210,10 @@ public class ProjectPropagationJobTest {
SparkResultToProjectThroughSemRelJob SparkResultToProjectThroughSemRelJob
.main( .main(
new String[] { new String[] {
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(), "-isSparkSessionManaged", Boolean.FALSE.toString(),
"-hive_metastore_uris", "", "-hive_metastore_uris", "",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation", "-outputPath", workingDir.toString() + "/relation",
"-potentialUpdatePath", potentialUpdatepath, "-potentialUpdatePath", potentialUpdatepath,
"-alreadyLinkedPath", alreadyLinkedPath, "-alreadyLinkedPath", alreadyLinkedPath,
@ -244,5 +264,7 @@ public class ProjectPropagationJobTest {
.sql( .sql(
"Select * from temporary where datainfo.inferenceprovenance = 'propagation'") "Select * from temporary where datainfo.inferenceprovenance = 'propagation'")
.count()); .count());
FileUtils.deleteDirectory(workingDir.toFile());
} }
} }
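
Taken together, the hunks above remove the -isTest and -saveGraph switches from the job invocation and move the working-directory (and, for the two update tests, the SparkSession) setup out of @BeforeAll and into each test, which now also deletes its own temporary directory. A compact, illustrative sketch of that per-test lifecycle follows; class and method names are invented for the example, while the SparkConf values mirror the ones kept in the test above.

// Illustrative sketch only (not the repository's code): a shared SparkConf, a session
// obtained per test via getOrCreate(), and a per-test temporary working directory
// that the test itself removes.
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Test;

public class PerTestWorkspaceSketch {

    private static SparkSession spark;

    private static final SparkConf conf = new SparkConf()
        .setAppName(PerTestWorkspaceSketch.class.getSimpleName())
        .setMaster("local[*]")
        .set("spark.driver.host", "localhost")
        .set("spark.ui.enabled", "false");

    @AfterAll
    public static void afterAll() {
        if (spark != null) {
            spark.stop();
        }
    }

    @Test
    void runsInItsOwnWorkspace() throws Exception {
        Path workingDir = Files.createTempDirectory(PerTestWorkspaceSketch.class.getSimpleName());
        spark = SparkSession.builder().config(conf).getOrCreate();

        // ... run the job under test with "-outputPath", workingDir.toString() + "/relation", ...

        FileUtils.deleteDirectory(workingDir.toFile());
    }
}

Because getOrCreate() returns the already-active session when one exists, re-invoking it inside every test is cheap while still letting each test own and clean up its output directory.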

File diff suppressed because one or more lines are too long

@ -0,0 +1 @@
{"resultId":"50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217","authorList":[{"name":"Željka","surname":"Barbarić-Mikočević","fullname":"Barbarić-Mikočević, Željka","orcid":"0000-0002-5001-6911"}]}

@ -0,0 +1 @@
{"resultId":"50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217","authorList":[{"name":"Vesna","surname":"Džimbeg-Malčić","fullname":"Džimbeg-Malčić, Vesna","orcid":"0000-0002-1234-5678"}]}

@ -0,0 +1,18 @@
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"isSupplementedBy","relType":"datasourceOrganization","source":"50|57a035e5b1ae::0637d444355058eb76ab6d7a842aa8b4","subRelType":"provision","target":"50|475c1990cbb2::02d3c300ac2d07135a6208159c512f62","validated":false}
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"isSupplementedBy","relType":"datasourceOrganization","source":"50|57a035e5b1ae::01894f77220771428abaecbfa2bcc8f7","subRelType":"provision","target":"50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217","validated":false}
{"collectedfrom":[{"key":"10|openaire____::6ac933301a3933c8a22ceebea7000326","value":"Academy of Finland"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"isSupplementTo","relType":"projectOrganization","source":"50|475c1990cbb2::02d3c300ac2d07135a6208159c512f62","subRelType":"participation","target":"50|57a035e5b1ae::0637d444355058eb76ab6d7a842aa8b4","validated":false}
{"collectedfrom":[{"key":"10|openaire____::6ac933301a3933c8a22ceebea7000326","value":"Academy of Finland"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"isSupplementTo","relType":"projectOrganization","source":"50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217","subRelType":"participation","target":"50|57a035e5b1ae::01894f77220771428abaecbfa2bcc8f7","validated":false}
{"collectedfrom":[{"key":"10|openaire____::457528c43fabd74e212db2ed61101075","value":"Agence Nationale de la Recherche"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"isSupplementedBy","relType":"projectOrganization","source":"50|57a035e5b1ae::07b10647d24e46073785210d4715f4e9","subRelType":"participation","target":"50|475c1990cbb2::699e01797642d72238c502ffcae18277","validated":false}
{"collectedfrom":[{"key":"10|openaire____::457528c43fabd74e212db2ed61101075","value":"Agence Nationale de la Recherche"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"IsSupplementedBy","relType":"projectOrganization","source":"50|57a035e5b1ae::0cee1d69f1cab270c382eaa853bcf4dc","subRelType":"participation","target":"50|475c1990cbb2::b778659ec5014f3db4c4e03c7907a69d","validated":false}
{"collectedfrom":[{"key":"10|openaire____::457528c43fabd74e212db2ed61101075","value":"Agence Nationale de la Recherche"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"IsSupplementTo","relType":"projectOrganization","source":"50|57a035e5b1ae::0d428b3119b0c822270df15058029172","subRelType":"participation","target":"50|475c1990cbb2::c8172336a860b66965e8d43a5494de2c","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::27b677f5d4a8b3a1159dba624016dc70","subRelType":"participation","target":"20|corda_______::0790e5c820c6a795d2b7524415cefb53","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::b5db617bb0f475b49584f5ee5120227c","subRelType":"participation","target":"20|corda_______::16220fe1781e3beb748872d31aa7f789","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::2907ce789238006cbe07f3e89820c9df","subRelType":"participation","target":"20|corda_______::43edcb7ca35d487ec357959e05c7ed7b","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::d185f413b046d7a7b15808388dad71a5","subRelType":"participation","target":"20|corda_______::46ac0acd65a3c66b10842bf291be9660","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::9c454e23267b520b621199fd4a79e3a6","subRelType":"participation","target":"20|corda_______::86fa29ae6a36610616e1691e1283f807","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::e40925978874b5f57378f301370e1293","subRelType":"participation","target":"20|corda_______::88e4a05f9c42a4830ffdd51663ed4538","validated":false}
{"collectedfrom":[{"key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8","value":"OpenOrgs Database"}],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.990"},"lastupdatetimestamp":1649252022894,"properties":[],"relClass":"merges","relType":"organizationOrganization","source":"20|pending_org_::5a01343420bc742ec1891cd98c36a258","subRelType":"dedup","target":"20|corda_______::a7468d48c5f0517ec67a2a9163af7150","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::531cfba3fa5e10f6be1e42e3c54cc95f","subRelType":"participation","target":"20|corda_______::b2233c6930da222c40e78302385a277d","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::48cb178c2561829bc2eedd787c052d48","subRelType":"participation","target":"20|corda_______::cd8ad1c4f710b667b74362c1674b92e6","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::795be98a5ba4c9190a32fc56033a9540","subRelType":"participation","target":"20|corda_______::f2323f9ed70f0f3a93fdfbb92f715e0e","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::ca5b255e4b2ef49ff424e0019962591c","subRelType":"participation","target":"20|corda_______::f2323f9ed70f0f3a93fdfbb92f715e0e","validated":false}