Compare commits

...

17 Commits

Author SHA1 Message Date
Miriam Baglioni 481c4e28d1 [Enrichment single step] changed test for country propagation and fixed inconsistencies 2022-11-24 09:36:10 +01:00
Miriam Baglioni 2e6af7c655 merging with branch beta 2022-11-23 11:31:35 +01:00
Miriam Baglioni de9d0ace38 [Enrichment single step] modification of the workflow and some changes in the classes 2022-11-23 09:54:50 +01:00
Miriam Baglioni b0969461f8 merging with branch beta 2022-11-22 16:54:23 +01:00
Miriam Baglioni 1e233bedf6 [Enrichment] - 2022-04-19 12:32:22 +02:00
Miriam Baglioni 30e0f60ac8 [Enrichment Step] get rid of hive 2022-04-14 08:50:37 +02:00
Miriam Baglioni 7501e823ed [Enrichment Step] get rid of hive 2022-04-13 17:46:22 +02:00
Miriam Baglioni d205bf78d8 [Enrichment Step] issue of NPE on author should be fixed 2022-04-13 14:39:13 +02:00
Miriam Baglioni 550e1a4e33 [Enrichment Step] issue of NPE on author should be fixed 2022-04-13 14:34:48 +02:00
Miriam Baglioni 8a39a85a5f [Enrichment WF] fixed issue in wf 2022-04-13 12:30:03 +02:00
Miriam Baglioni d1519fa28f [Enrichment Step] get rid of hive 2022-04-13 11:48:03 +02:00
Miriam Baglioni aecea5a095 merging with branch beta 2022-04-12 12:57:31 +02:00
Miriam Baglioni 1a8641227d [Enrichment Step] get rid of hive 2022-04-12 11:26:48 +02:00
Miriam Baglioni 157e6bf5e1 [Enrichment Step] get rid of hive 2022-04-11 14:09:55 +02:00
Miriam Baglioni a6c26a9e0e [Enrichment WF] third attempt to make it run in a single step 2022-04-11 09:58:39 +02:00
Miriam Baglioni 9bd5310112 [Enrichment WF] second attempt to make it run in a single step 2022-04-07 18:58:55 +02:00
Miriam Baglioni 7406c88276 [Enrichment WF] first attempt to make it run in a single step 2022-04-06 18:14:05 +02:00
74 changed files with 3645 additions and 1266 deletions

View File

@@ -0,0 +1,982 @@
{
"indexed": {
"date-parts": [
[
2022,
4,
5
]
],
"date-time": "2022-04-05T11:07:00Z",
"timestamp": 1649156820730
},
"reference-count": 63,
"publisher": "Public Library of Science (PLoS)",
"issue": "5",
"license": [
{
"start": {
"date-parts": [
[
2020,
5,
29
]
],
"date-time": "2020-05-29T00:00:00Z",
"timestamp": 1590710400000
},
"content-version": "vor",
"delay-in-days": 0,
"URL": "http://creativecommons.org/licenses/by/4.0/"
}
],
"funder": [
{
"DOI": "10.13039/501100001602",
"name": "Science Foundation Ireland",
"doi-asserted-by": "crossref",
"award": [
"SFI/12/RC/2273"
]
},
{
"DOI": "10.13039/501100001602",
"name": "Science Foundation Ireland",
"doi-asserted-by": "crossref",
"award": [
"SFI/12/RC/2273"
]
},
{
"DOI": "10.13039/501100001602",
"name": "Science Foundation Ireland",
"doi-asserted-by": "crossref",
"award": [
"SFI/12/RC/2273"
]
},
{
"DOI": "10.13039/501100001602",
"name": "Science Foundation Ireland",
"doi-asserted-by": "crossref",
"award": [
"SFI/12/RC/2273"
]
}
],
"content-domain": {
"domain": [
"www.plosone.org"
],
"crossmark-restriction": false
},
"short-container-title": [
"PLoS ONE"
],
"DOI": "10.1371/journal.pone.0233284",
"type": "journal-article",
"created": {
"date-parts": [
[
2020,
5,
29
]
],
"date-time": "2020-05-29T17:53:47Z",
"timestamp": 1590774827000
},
"page": "e0233284",
"update-policy": "http://dx.doi.org/10.1371/journal.pone.corrections_policy",
"source": "Crossref",
"is-referenced-by-count": 13,
"title": [
"Vancomycin and nisin A are effective against biofilms of multi-drug resistant Staphylococcus aureus isolates from human milk"
],
"prefix": "10.1371",
"volume": "15",
"author": [
{
"ORCID": "http://orcid.org/0000-0003-4107-0278",
"authenticated-orcid": true,
"given": "Angeliki",
"family": "Angelopoulou",
"sequence": "first",
"affiliation": [
]
},
{
"given": "Des",
"family": "Field",
"sequence": "additional",
"affiliation": [
]
},
{
"given": "Mariana",
"family": "Pérez-Ibarreche",
"sequence": "additional",
"affiliation": [
]
},
{
"ORCID": "http://orcid.org/0000-0001-8317-6455",
"authenticated-orcid": true,
"given": "Alicja K.",
"family": "Warda",
"sequence": "additional",
"affiliation": [
]
},
{
"given": "Colin",
"family": "Hill",
"sequence": "additional",
"affiliation": [
]
},
{
"given": "R. Paul",
"family": "Ross",
"sequence": "additional",
"affiliation": [
]
}
],
"member": "340",
"published-online": {
"date-parts": [
[
2020,
5,
29
]
]
},
"reference": [
{
"issue": "78",
"key": "pone.0233284.ref001",
"doi-asserted-by": "crossref",
"first-page": "509",
"DOI": "10.1016/S0899-9007(00)00363-4",
"article-title": "Breast milk: a truly functional food",
"volume": "16",
"author": "B. Lönnerdal",
"year": "2000",
"journal-title": "Nutrition"
},
{
"key": "pone.0233284.ref002",
"first-page": "1",
"volume-title": "Mastitis: causes and management",
"author": "WHO",
"year": "2000"
},
{
"issue": "12",
"key": "pone.0233284.ref003",
"doi-asserted-by": "crossref",
"first-page": "115",
"DOI": "10.1038/pr.2014.178",
"article-title": "Human milk and infant intestinal mucosal glycans guide succession of the neonatal intestinal microbiota",
"volume": "77",
"author": "DS Newburg",
"year": "2015",
"journal-title": "Pediatr Res"
},
{
"issue": "7",
"key": "pone.0233284.ref004",
"doi-asserted-by": "crossref",
"first-page": "647",
"DOI": "10.1001/jamapediatrics.2017.0378",
"article-title": "Association between breast milk bacterial communities and establishment and development of the infant gut microbiome",
"volume": "171",
"author": "PS Pannaraj",
"year": "2017",
"journal-title": "JAMA Pediatr"
},
{
"issue": "Suppl 2:",
"key": "pone.0233284.ref005",
"doi-asserted-by": "crossref",
"first-page": "S69",
"DOI": "10.1542/peds.2008-1315i",
"article-title": "Why mothers stop breastfeeding: mothers' self-reported reasons for stopping during the first year",
"volume": "122",
"author": "R Li",
"year": "2008",
"journal-title": "Pediatrics"
},
{
"issue": "2",
"key": "pone.0233284.ref006",
"doi-asserted-by": "crossref",
"first-page": "83",
"DOI": "10.1007/s00430-017-0532-z",
"article-title": "The microbiology and treatment of human mastitis",
"volume": "207",
"author": "A Angelopoulou",
"year": "2018",
"journal-title": "Med Microbiol Immunol"
},
{
"issue": "2",
"key": "pone.0233284.ref007",
"doi-asserted-by": "crossref",
"first-page": "169",
"DOI": "10.3920/BM2013.0036",
"article-title": "Probiotics for human lactational mastitis",
"volume": "5",
"author": "L Fernández",
"year": "2014",
"journal-title": "Benef Microbes"
},
{
"issue": "15",
"key": "pone.0233284.ref008",
"doi-asserted-by": "crossref",
"first-page": "4650",
"DOI": "10.1128/AEM.02599-07",
"article-title": "Oral administration of Lactobacillus strains isolated from breast milk as an alternative for the treatment of infectious mastitis during lactation",
"volume": "74",
"author": "E Jiménez",
"year": "2008",
"journal-title": "Appl Environ Microbiol"
},
{
"issue": "3",
"key": "pone.0233284.ref009",
"doi-asserted-by": "crossref",
"first-page": "406",
"DOI": "10.1177/0890334415585078",
"article-title": "Metagenomic analysis of milk of healthy and mastitis-suffering women",
"volume": "31",
"author": "E Jiménez",
"year": "2015",
"journal-title": "J Hum Lact"
},
{
"issue": "2",
"key": "pone.0233284.ref010",
"doi-asserted-by": "crossref",
"first-page": "176",
"DOI": "10.1086/589241",
"article-title": "Risk of infection and death due to methicillin-resistant Staphylococcus aureus in long-term carriers",
"volume": "47",
"author": "R Datta",
"year": "2008",
"journal-title": "Clin Infect Dis"
},
{
"issue": "4",
"key": "pone.0233284.ref011",
"doi-asserted-by": "crossref",
"DOI": "10.1128/microbiolspec.GPP3-0023-2018",
"article-title": "Staphylococcal biofilms",
"volume": "6",
"author": "M. Otto",
"year": "2018",
"journal-title": "Microbiol Spectr"
},
{
"issue": "1",
"key": "pone.0233284.ref012",
"doi-asserted-by": "crossref",
"first-page": "9",
"DOI": "10.1038/s41522-018-0053-6",
"article-title": "Fighting biofilms with lantibiotics and other groups of bacteriocins",
"volume": "4",
"author": "H Mathur",
"year": "2018",
"journal-title": "NPJ Biofilms Microbiomes"
},
{
"issue": "2",
"key": "pone.0233284.ref013",
"doi-asserted-by": "crossref",
"first-page": "310",
"DOI": "10.1128/MMBR.00041-08",
"article-title": "Signals, regulatory networks, and materials that build and break bacterial biofilms",
"volume": "73",
"author": "E Karatan",
"year": "2009",
"journal-title": "Microbiol Mol Biol Rev"
},
{
"issue": "3",
"key": "pone.0233284.ref014",
"doi-asserted-by": "crossref",
"first-page": "147",
"DOI": "10.1016/S1473-3099(01)00091-3",
"article-title": "Vancomycin-resistant Staphylococcus aureus: a new model of antibiotic resistance",
"volume": "1",
"author": "K Hiramatsu",
"year": "2001",
"journal-title": "Lancet Infect Dis"
},
{
"key": "pone.0233284.ref015",
"doi-asserted-by": "crossref",
"first-page": "339",
"DOI": "10.1146/annurev.mi.38.100184.002011",
"article-title": "The structure and mode of action of glycopeptide antibiotics of the vancomycin group",
"volume": "38",
"author": "JC Barna",
"year": "1984",
"journal-title": "Annu Rev Microbiol"
},
{
"key": "pone.0233284.ref016",
"unstructured": "Health Service Executive Mastitis Factsheet for Health Care Professionals. Available at: https://www.breastfeeding.ie/Uploads/Mastitis.pdf"
},
{
"issue": "3",
"key": "pone.0233284.ref017",
"first-page": "136",
"article-title": "Breast infection: a review of diagnosis and management practices",
"volume": "14",
"author": "E Boakes",
"year": "2018",
"journal-title": "Eur J Breast Health"
},
{
"key": "pone.0233284.ref018",
"doi-asserted-by": "crossref",
"first-page": "1205",
"DOI": "10.3389/fmicb.2017.01205",
"article-title": "Bacteriocin-antimicrobial synergy: A medical and food perspective",
"volume": "8",
"author": "H Mathur",
"year": "2017",
"journal-title": "Front Microbiol"
},
{
"issue": "1",
"key": "pone.0233284.ref019",
"doi-asserted-by": "crossref",
"first-page": "223",
"DOI": "10.1016/S0005-2736(99)00208-4",
"article-title": "The lantibiotic nisin, a special case or not?",
"volume": "1462",
"author": "E Breukink",
"year": "1999",
"journal-title": "Biochim Biophys Acta"
},
{
"issue": "10",
"key": "pone.0233284.ref020",
"doi-asserted-by": "crossref",
"first-page": "963",
"DOI": "10.1038/nsmb830",
"article-title": "The nisin-lipid II complex reveals a pyrophosphate cage that provides a blueprint for novel antibiotics",
"volume": "11",
"author": "ST Hsu",
"year": "2004",
"journal-title": "Nat Struct Mol Biol"
},
{
"issue": "5793",
"key": "pone.0233284.ref021",
"doi-asserted-by": "crossref",
"first-page": "1636",
"DOI": "10.1126/science.1129818",
"article-title": "An alternative bactericidal mechanism of action for lantibiotic peptides that target lipid II",
"volume": "313",
"author": "HE Hasper",
"year": "2006",
"journal-title": "Science"
},
{
"issue": "3",
"key": "pone.0233284.ref022",
"doi-asserted-by": "crossref",
"first-page": "1772",
"DOI": "10.1074/jbc.M006770200",
"article-title": "Specific binding of nisin to the peptidoglycan precursor lipid II combines pore formation and inhibition of cell wall biosynthesis for potent antibiotic activity",
"volume": "276",
"author": "I Wiedemann",
"year": "2001",
"journal-title": "J Biol Chem"
},
{
"key": "pone.0233284.ref023",
"doi-asserted-by": "crossref",
"first-page": "104539",
"DOI": "10.1016/j.idairyj.2019.104539",
"article-title": "Bovine mastitis is a polymicrobial disease requiring a polydiagnostic approach",
"volume": "99",
"author": "A Angelopoulou",
"year": "2019",
"journal-title": "Int Dairy J"
},
{
"issue": "4",
"key": "pone.0233284.ref024",
"doi-asserted-by": "crossref",
"first-page": "493",
"DOI": "10.1093/ajcp/45.4_ts.493",
"article-title": "Antibiotic susceptibility testing by a standardized single disk method",
"volume": "45",
"author": "AW Bauer",
"year": "1966",
"journal-title": "Am Journal Clin Pathol"
},
{
"key": "pone.0233284.ref025",
"unstructured": "v_9.0_Breakpoint_Tables.pdf. Available at: http://www.eucast.org/fileadmin/src/media/PDFs/EUCAST_files/Breakpoint_tables/v_9.0_Breakpoint_Tables.pdf (Accessed: 28th July 2019)."
},
{
"issue": "12",
"key": "pone.0233284.ref026",
"doi-asserted-by": "crossref",
"first-page": "67",
"DOI": "10.1016/j.vetmic.2010.05.044",
"article-title": "(GTG)5-PCR fingerprinting for the classification and identification of coagulase-negative Staphylococcus species from bovine milk and teat apices: a comparison of type strains and field isolates",
"volume": "147",
"author": "G Braem",
"year": "2011",
"journal-title": "Vet Microbiol"
},
{
"key": "pone.0233284.ref027",
"doi-asserted-by": "crossref",
"first-page": "270",
"DOI": "10.1186/s12859-015-0703-0",
"article-title": "GelJ a tool for analyzing DNA fingerprint gel images",
"volume": "16",
"author": "J Heras",
"year": "2015",
"journal-title": "BMC bioinformatics"
},
{
"issue": "4",
"key": "pone.0233284.ref028",
"doi-asserted-by": "crossref",
"first-page": "473",
"DOI": "10.1111/j.1751-7915.2010.00184.x",
"article-title": "Studies with bioengineered nisin peptides highlight the broad-spectrum potency of nisin V",
"volume": "3",
"author": "D Field",
"year": "2010",
"journal-title": "Microb Biotechnol"
},
{
"issue": "11",
"key": "pone.0233284.ref029",
"doi-asserted-by": "crossref",
"first-page": "e79563",
"DOI": "10.1371/journal.pone.0079563",
"article-title": "Intensive mutagenesis of the nisin hinge leads to the rational design of enhanced derivatives",
"volume": "8",
"author": "B Healy",
"year": "2013",
"journal-title": "PLoS One"
},
{
"issue": "10",
"key": "pone.0233284.ref030",
"doi-asserted-by": "crossref",
"first-page": "e46884",
"DOI": "10.1371/journal.pone.0046884",
"article-title": "Bioengineered nisin A derivatives with enhanced activity against both Gram positive and Gram negative pathogens",
"volume": "7",
"author": "D Field",
"year": "2012",
"journal-title": "PLoS One"
},
{
"issue": "3",
"key": "pone.0233284.ref031",
"doi-asserted-by": "crossref",
"first-page": "e0119684",
"DOI": "10.1371/journal.pone.0119684",
"article-title": "A Bioengineered nisin derivative to control biofilms of Staphylococcus pseudintermedius",
"volume": "10",
"author": "D Field",
"year": "2015a",
"journal-title": "PLoS One"
},
{
"issue": "2",
"key": "pone.0233284.ref032",
"doi-asserted-by": "crossref",
"first-page": "207",
"DOI": "10.1007/s13765-012-3253-4",
"article-title": "Biofilm formation, attachment, and cell hydrophobicity of foodborne pathogens under varied environmental conditions",
"volume": "56",
"author": "NY Choi",
"year": "2013",
"journal-title": "J Korean Soc Appl Biol Chem"
},
{
"issue": "2",
"key": "pone.0233284.ref033",
"doi-asserted-by": "crossref",
"first-page": "175",
"DOI": "10.1016/S0167-7012(00)00122-6",
"article-title": "A modified microtiter-plate test for quantification of staphylococcal biofilm formation",
"volume": "40",
"author": "S Stepanović",
"year": "2000",
"journal-title": "J Microbiol Methods"
},
{
"issue": "2",
"key": "pone.0233284.ref034",
"doi-asserted-by": "crossref",
"first-page": "225",
"DOI": "10.1111/j.1574-695X.2011.00806.x",
"article-title": "Characterization of Staphylococcus aureus strains involved in human and bovine mastitis",
"volume": "62",
"author": "S Delgado",
"year": "2011",
"journal-title": "FEMS Immunol Med Microbiol"
},
{
"key": "pone.0233284.ref035",
"doi-asserted-by": "crossref",
"first-page": "53",
"DOI": "10.2174/1874285801711010053",
"article-title": "Understanding the mechanism of bacterial biofilms resistance to antimicrobial agents",
"volume": "11",
"author": "S Singh",
"year": "2017",
"journal-title": "Open Microbiol J"
},
{
"issue": "1",
"key": "pone.0233284.ref036",
"doi-asserted-by": "crossref",
"first-page": "61",
"DOI": "10.2174/1389203053027584",
"article-title": "Bacterial lantibiotics: strategies to improve therapeutic potential",
"volume": "6",
"author": "PD Cotter",
"year": "2005",
"journal-title": "Curr Protein Pept Sci"
},
{
"issue": "5",
"key": "pone.0233284.ref037",
"doi-asserted-by": "crossref",
"first-page": "494",
"DOI": "10.1016/j.ijantimicag.2015.07.011",
"article-title": "Bacteriocins and their position in the next wave of conventional antibiotics",
"volume": "46",
"author": "VL Cavera",
"year": "2015",
"journal-title": "Int J Antimicrob Agents"
},
{
"key": "pone.0233284.ref038",
"doi-asserted-by": "crossref",
"first-page": "1363",
"DOI": "10.3389/fmicb.2015.01363",
"article-title": "Bioengineering lantibiotics for therapeutic success",
"volume": "6",
"author": "D Field",
"year": "2015b",
"journal-title": "Front Microbiol"
},
{
"issue": "11",
"key": "pone.0233284.ref039",
"doi-asserted-by": "crossref",
"first-page": "5572",
"DOI": "10.1128/AAC.00888-13",
"article-title": "Effects of bacteriocins on methicillin-resistant Staphylococcus aureus biofilm",
"volume": "57",
"author": "K Okuda",
"year": "2013",
"journal-title": "Antimicrob Agents Chemother"
},
{
"issue": "6",
"key": "pone.0233284.ref040",
"doi-asserted-by": "crossref",
"first-page": "511",
"DOI": "10.1159/000335598",
"article-title": "In vitro activities of nisin alone or in combination with vancomycin and ciprofloxacin against methicillin-resistant and methicillin-susceptible Staphylococcus aureus strains",
"volume": "57",
"author": "S Dosler",
"year": "2011",
"journal-title": "Chemotherapy"
},
{
"issue": "18",
"key": "pone.0233284.ref041",
"doi-asserted-by": "crossref",
"first-page": "5809",
"DOI": "10.1128/AEM.01104-07",
"article-title": "Dissection and modulation of the four distinct activities of nisin by mutagenesis of rings A and B and by C-terminal truncation",
"volume": "73",
"author": "R Rink",
"year": "2007",
"journal-title": "Appl Environ Microbiol"
},
{
"issue": "6",
"key": "pone.0233284.ref042",
"doi-asserted-by": "crossref",
"first-page": "806",
"DOI": "10.1007/s00253-004-1599-1",
"article-title": "Site-directed mutagenesis of the hinge region of nisinZ and properties of nisinZ mutants",
"volume": "64",
"author": "J Yuan",
"year": "2004",
"journal-title": "Appl Microbiol Biotechnol"
},
{
"key": "pone.0233284.ref043",
"doi-asserted-by": "crossref",
"first-page": "508",
"DOI": "10.3389/fmicb.2016.00508",
"article-title": "In vitro activities of nisin and nisin derivatives alone and in combination with antibiotics against Staphylococcus biofilms",
"volume": "7",
"author": "D Field",
"year": "2016",
"journal-title": "Front Microbiol"
},
{
"issue": "18",
"key": "pone.0233284.ref044",
"doi-asserted-by": "crossref",
"first-page": "1573",
"DOI": "10.2217/fmb-2019-0153",
"article-title": "Nisin Z and lacticin 3147 improve efficacy of antibiotics against clinically significant bacteria",
"volume": "14",
"author": "JC Ellis",
"year": "2020",
"journal-title": "Future Microbiol"
},
{
"issue": "3",
"key": "pone.0233284.ref045",
"doi-asserted-by": "crossref",
"first-page": "311",
"DOI": "10.1177/0890334408317435",
"article-title": "The bacteriocin nisin, an effective agent for the treatment of staphylococcal mastitis during lactation",
"volume": "24",
"author": "L Fernández",
"year": "2008",
"journal-title": "J Hum Lact"
},
{
"issue": "1",
"key": "pone.0233284.ref046",
"doi-asserted-by": "crossref",
"first-page": "33",
"DOI": "10.1159/000272223",
"article-title": "Inflammatory breast diseases during lactation: milk stasis, puerperal mastitis, abscesses of the breast, and malignant tumorscurrent and evidence-based strategies for diagnosis and therapy",
"volume": "5",
"author": "M Abou-Dakn",
"year": "2010",
"journal-title": "Breast Care"
},
{
"issue": "6",
"key": "pone.0233284.ref047",
"doi-asserted-by": "crossref",
"first-page": "430",
"DOI": "10.1007/s12262-012-0776-1",
"article-title": "Management of lactational mastitis and breast abscesses: review of current knowledge and practice",
"volume": "75",
"author": "K Kataria",
"year": "2013",
"journal-title": "Indian J Surg"
},
{
"issue": "2",
"key": "pone.0233284.ref048",
"doi-asserted-by": "crossref",
"first-page": "77",
"DOI": "10.1016/j.micres.2012.09.004",
"article-title": "Genotypic diversity and virulent factors of Staphylococcus epidermidis isolated from human breast milk",
"volume": "168",
"author": "J Begović",
"year": "2013",
"journal-title": "Microbiol Res"
},
{
"issue": "Pt 8",
"key": "pone.0233284.ref049",
"doi-asserted-by": "crossref",
"first-page": "761",
"DOI": "10.1099/jmm.0.05453-0",
"article-title": "Antimicrobial-resistance and enterotoxin-encoding genes among staphylococci isolated from expressed human breast milk",
"volume": "53",
"author": "LA Carneiro",
"year": "2004",
"journal-title": "J Med Microbiol"
},
{
"issue": "2",
"key": "pone.0233284.ref050",
"doi-asserted-by": "crossref",
"first-page": "113",
"DOI": "10.1007/s00284-015-0925-4",
"article-title": "Antibiotic susceptibility of commensal bacteria from human milk",
"volume": "72",
"author": "PW Chen",
"year": "2016",
"journal-title": "Curr Microbiol"
},
{
"key": "pone.0233284.ref051",
"doi-asserted-by": "crossref",
"first-page": "2512",
"DOI": "10.3389/fmicb.2018.02512",
"article-title": "Microbial community dynamics in mother's milk and infant's mouth and gut in moderately preterm infants",
"volume": "9",
"author": "E Biagi",
"year": "2018",
"journal-title": "Front Microbiol"
},
{
"key": "pone.0233284.ref052",
"doi-asserted-by": "crossref",
"first-page": "4",
"DOI": "10.3410/M4-4",
"article-title": "Reduced vancomycin susceptibility among clinical Staphylococcus aureus isolates ('the MIC Creep'): implications for therapy",
"volume": "4",
"author": "A Dhand",
"year": "2012",
"journal-title": "F1000 Med Rep"
},
{
"issue": "12",
"key": "pone.0233284.ref053",
"doi-asserted-by": "crossref",
"first-page": "1112",
"DOI": "10.1136/jcp.2009.069021",
"article-title": "Low concentrations of vancomycin stimulate biofilm formation in some clinical isolates of Staphylococcus epidermidis",
"volume": "62",
"author": "JS Cargill",
"year": "2009",
"journal-title": "J Clin Pathol"
},
{
"issue": "2",
"key": "pone.0233284.ref054",
"doi-asserted-by": "crossref",
"first-page": "191",
"DOI": "10.1002/jobm.201000221",
"article-title": "Effect of sub-lethal doses of vancomycin and oxacillin on biofilm formation by vancomycin intermediate resistant Staphylococcus aureus",
"volume": "51",
"author": "ZA Mirani",
"year": "2011",
"journal-title": "J Basic Microbiol"
},
{
"key": "pone.0233284.ref055",
"doi-asserted-by": "crossref",
"first-page": "225",
"DOI": "10.1016/j.micpath.2017.07.004",
"article-title": "Vancomycin-induced biofilm formation by methicillin-resistant Staphylococcus aureus is associated with the secretion of membrane vesicles",
"volume": "110",
"author": "X He",
"year": "2017",
"journal-title": "Microb Pathog"
},
{
"issue": "9",
"key": "pone.0233284.ref056",
"doi-asserted-by": "crossref",
"first-page": "1627",
"DOI": "10.4315/0362-028X.JFP-12-001",
"article-title": "Effects of nisin and lysozyme on growth inhibition and biofilm formation capacity of Staphylococcus aureus strains isolated from raw milk and cheese samples",
"volume": "75",
"author": "M Sudagidan",
"year": "2012",
"journal-title": "J Food Prot"
},
{
"issue": "3",
"key": "pone.0233284.ref057",
"doi-asserted-by": "crossref",
"first-page": "253",
"DOI": "10.1016/j.ijfoodmicro.2008.01.011",
"article-title": "Nisin-bacteriophage cross-resistance in Staphylococcus aureus",
"volume": "122",
"author": "B Martinez",
"year": "2008",
"journal-title": "Int J Food Microbiol"
},
{
"issue": "1",
"key": "pone.0233284.ref058",
"doi-asserted-by": "crossref",
"first-page": "82",
"DOI": "10.2146/ajhp080434",
"article-title": "Therapeutic monitoring of vancomycin in adult patients: a consensus review of the american society of health-system pharmacists, the infectious diseases society of america, and the society of infectious diseases pharmacists",
"volume": "66",
"author": "M Rybak",
"year": "2009",
"journal-title": "Am J Health Syst Pharm"
},
{
"issue": "2",
"key": "pone.0233284.ref059",
"doi-asserted-by": "crossref",
"first-page": "277",
"DOI": "10.1111/j.1574-695X.2007.00300.x",
"article-title": "Increased tolerance of Staphylococcus aureus to vancomycin in viscous media",
"volume": "51",
"author": "V Kostenko",
"year": "2007",
"journal-title": "FEMS Immunol Med Microbiol"
},
{
"key": "pone.0233284.ref060",
"first-page": "107",
"article-title": "Multidrug tolerance of biofilms and persister cells",
"volume": "322",
"author": "K. Lewis",
"year": "2008",
"journal-title": "Curr Top Microbiol Immunol"
},
{
"issue": "6",
"key": "pone.0233284.ref061",
"doi-asserted-by": "crossref",
"first-page": "ftw056",
"DOI": "10.1093/femspd/ftw056",
"article-title": "Penetration barrier contributes to bacterial biofilm-associated resistance against only select antibiotics, and exhibits genus-, strain- and antibiotic-specific differences",
"volume": "74",
"author": "R Singh",
"year": "2016",
"journal-title": "Pathog Dis"
},
{
"issue": "12",
"key": "pone.0233284.ref062",
"doi-asserted-by": "crossref",
"first-page": "7273",
"DOI": "10.1128/AAC.03132-14",
"article-title": "Extracellular DNA impedes the transport of vancomycin in Staphylococcus epidermidis biofilms preexposed to subinhibitory concentrations of vancomycin",
"volume": "58",
"author": "N Doroshenko",
"year": "2014",
"journal-title": "Antimicrob Agents Chemotherapy"
},
{
"issue": "1",
"key": "pone.0233284.ref063",
"doi-asserted-by": "crossref",
"first-page": "46",
"DOI": "10.1007/s00776-005-0968-7",
"article-title": "Antimicrobial susceptibility of Staphylococcus aureus and Staphylococcus epidermidis biofilms isolated from infected total hip arthroplasty cases",
"volume": "11",
"author": "S Nishimura",
"year": "2006",
"journal-title": "J Orthop Sci"
}
],
"container-title": [
"PLOS ONE"
],
"original-title": [
],
"language": "en",
"link": [
{
"URL": "https://dx.plos.org/10.1371/journal.pone.0233284",
"content-type": "unspecified",
"content-version": "vor",
"intended-application": "similarity-checking"
}
],
"deposited": {
"date-parts": [
[
2020,
5,
29
]
],
"date-time": "2020-05-29T17:54:37Z",
"timestamp": 1590774877000
},
"score": 1,
"resource": {
"primary": {
"URL": "https://dx.plos.org/10.1371/journal.pone.0233284"
}
},
"subtitle": [
],
"editor": [
{
"given": "Rita G.",
"family": "Sobral",
"sequence": "first",
"affiliation": [
]
}
],
"short-title": [
],
"issued": {
"date-parts": [
[
2020,
5,
29
]
]
},
"references-count": 63,
"journal-issue": {
"issue": "5",
"published-online": {
"date-parts": [
[
2020,
5,
29
]
]
}
},
"URL": "http://dx.doi.org/10.1371/journal.pone.0233284",
"relation": {
},
"ISSN": [
"1932-6203"
],
"issn-type": [
{
"value": "1932-6203",
"type": "electronic"
}
],
"subject": [
"Multidisciplinary"
],
"published": {
"date-parts": [
[
2020,
5,
29
]
]
}
}

View File

@@ -475,6 +475,86 @@ class CrossrefMappingTest {
}
@Test
def testConvertArticleFromCrossRef2OafSFI(): Unit = {
val json = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/sfi_funded_article.json"))
.mkString
assertNotNull(json)
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty)
val items = resultList.filter(p => p.isInstanceOf[Publication])
assert(items.nonEmpty)
assert(items.size == 1)
val result: Result = items.head.asInstanceOf[Publication]
assertNotNull(result)
logger.info(mapper.writeValueAsString(result));
assertNotNull(result.getDataInfo, "Datainfo test not null Failed");
assertNotNull(
result.getDataInfo.getProvenanceaction,
"DataInfo/Provenance test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassid.isEmpty,
"DataInfo/Provenance/classId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getClassname.isEmpty,
"DataInfo/Provenance/className test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemeid.isEmpty,
"DataInfo/Provenance/SchemeId test not null Failed"
);
assertFalse(
result.getDataInfo.getProvenanceaction.getSchemename.isEmpty,
"DataInfo/Provenance/SchemeName test not null Failed"
);
assertNotNull(result.getCollectedfrom, "CollectedFrom test not null Failed");
assertFalse(result.getCollectedfrom.isEmpty);
val collectedFromList = result.getCollectedfrom.asScala
assert(
collectedFromList.exists(c => c.getKey.equalsIgnoreCase("10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")),
"Wrong collected from assertion"
)
assert(
collectedFromList.exists(c => c.getValue.equalsIgnoreCase("crossref")),
"Wrong collected from assertion"
)
val relevantDates = result.getRelevantdate.asScala
assert(
relevantDates.exists(d => d.getQualifier.getClassid.equalsIgnoreCase("created")),
"Missing relevant date of type created"
)
val rels = resultList.filter(p => p.isInstanceOf[Relation]).asInstanceOf[List[Relation]]
assertFalse(rels.isEmpty)
rels.foreach(relation => {
assertNotNull(relation)
assertFalse(relation.getSource.isEmpty)
assertFalse(relation.getTarget.isEmpty)
assertFalse(relation.getRelClass.isEmpty)
assertFalse(relation.getRelType.isEmpty)
assertFalse(relation.getSubRelType.isEmpty)
})
}
@Test
def testConvertFromCrossRef2OafIssue(): Unit = {
val json = Source

View File

@@ -1,5 +1,5 @@
package eu.dnetlib.dhp.countrypropagation;
package eu.dnetlib.dhp;
import java.io.Serializable;

View File

@@ -23,4 +23,5 @@ public class KeyValueSet implements Serializable {
public void setValueSet(ArrayList<String> valueSet) {
this.valueSet = valueSet;
}
}

View File

@@ -4,22 +4,21 @@ package eu.dnetlib.dhp;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.*;
public class PropagationConstant {
@@ -221,9 +220,28 @@ public class PropagationConstant {
.orElse(Boolean.FALSE);
}
public static void createCfHbforResult(SparkSession spark) {
org.apache.spark.sql.Dataset<Row> cfhb = spark.sql(cfHbforResultQuery);
cfhb.createOrReplaceTempView("cfhb");
// for each result, collects the distinct collectedfrom keys (at the level of the result) and hostedby keys,
// and produces a (resultId, key) pair for each distinct key associated to the result
public static <R extends Result> void createCfHbforResult(SparkSession spark, String inputPath, String outputPath,
Class<R> resultClazz) {
readPath(spark, inputPath, resultClazz)
.filter(
(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
!r.getDataInfo().getInvisible())
.flatMap((FlatMapFunction<R, EntityEntityRel>) r -> {
Set<String> cfhb = r.getCollectedfrom().stream().map(cf -> cf.getKey()).collect(Collectors.toSet());
cfhb.addAll(r.getInstance().stream().map(i -> i.getHostedby().getKey()).collect(Collectors.toSet()));
return cfhb
.stream()
.map(value -> EntityEntityRel.newInstance(r.getId(), value))
.collect(Collectors.toList())
.iterator();
}, Encoders.bean(EntityEntityRel.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
public static <R> Dataset<R> readPath(
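For reference, a minimal standalone sketch (plain Java, no Spark; every name below is an illustrative stand-in, not a production type) of the pair extraction performed by createCfHbforResult: take the distinct union of a result's collectedfrom keys and hostedby keys, and emit one (resultId, key) pair per key.

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class CfHbPairSketch {

	// distinct union of collectedfrom and hostedby keys -> (resultId, key) pairs
	public static List<String[]> cfHbPairs(String resultId,
		List<String> collectedFromKeys, List<String> hostedByKeys) {
		Set<String> keys = new HashSet<>(collectedFromKeys);
		keys.addAll(hostedByKeys);
		return keys
			.stream()
			.map(key -> new String[] { resultId, key })
			.collect(Collectors.toList());
	}

	public static void main(String[] args) {
		cfHbPairs(
			"50|result::1",
			Arrays.asList("10|ds::a", "10|ds::b"),
			Arrays.asList("10|ds::b", "10|ds::c"))
			.forEach(p -> System.out.println(p[0] + " -> " + p[1])); // three distinct pairs
	}
}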

View File

@@ -64,12 +64,6 @@ public class SparkBulkTagJob {
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final Boolean saveGraph = Optional
.ofNullable(parser.get("saveGraph"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("saveGraph: {}", saveGraph);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
@@ -86,10 +80,9 @@ public class SparkBulkTagJob {
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
});
spark ->
execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc)
);
}
private static <R extends Result> void execBulkTag(
@@ -113,6 +106,13 @@ public class SparkBulkTagJob {
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
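// note: the tagged results are materialized under outputPath first and then copied back
// over inputPath, since Spark cannot safely overwrite a path the same plan is still reading from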
readPath(spark, outputPath, resultClazz)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(inputPath);
}
public static <R> Dataset<R> readPath(

View File

@@ -16,6 +16,7 @@ import javax.print.attribute.DocAttributeSet;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
@@ -34,6 +35,7 @@ import eu.dnetlib.dhp.bulktag.community.*;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2;
/**
* @author miriam.baglioni
@@ -44,6 +46,11 @@ public class SparkEoscBulkTag implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkEoscBulkTag.class);
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static String OPENAIRE_3 = "openaire3.0";
private static String OPENAIRE_4 = "openaire-pub_4.0";
private static String OPENAIRE_CRIS = "openaire-cris_1.1";
private static String OPENAIRE_DATA = "openaire2.0_data";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
@@ -72,6 +79,9 @@ public class SparkEoscBulkTag implements Serializable {
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String resultType = parser.get("resultType");
log.info("resultType: {}", resultType);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
@@ -82,41 +92,71 @@
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, workingPath);
execBulkTag(spark, inputPath, workingPath, datasourceMapPath, resultClazz);
selectCompliantDatasources(spark, inputPath, workingPath, datasourceMapPath);
execBulkTag(spark, inputPath, workingPath, resultType, resultClazz);
});
}
private static void selectCompliantDatasources(SparkSession spark, String inputPath, String workingPath,
String datasourceMapPath) {
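// keep only the datasources whose OpenAIRE compatibility level is one of the accepted
// profiles listed above, then resolve each of them to its master datasource via the left join below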
Dataset<Datasource> datasources = readPath(spark, inputPath + "datasource", Datasource.class)
.filter((FilterFunction<Datasource>) ds -> {
final String compatibility = ds.getOpenairecompatibility().getClassid();
return compatibility.equalsIgnoreCase(OPENAIRE_3) ||
compatibility.equalsIgnoreCase(OPENAIRE_4) ||
compatibility.equalsIgnoreCase(OPENAIRE_CRIS) ||
compatibility.equalsIgnoreCase(OPENAIRE_DATA);
});
Dataset<DatasourceMaster> datasourceMaster = readPath(spark, datasourceMapPath, DatasourceMaster.class);
datasources
.joinWith(datasourceMaster, datasources.col("id").equalTo(datasourceMaster.col("master")), "left")
.map(
(MapFunction<Tuple2<Datasource, DatasourceMaster>, DatasourceMaster>) t2 -> t2._2(),
Encoders.bean(DatasourceMaster.class))
.filter(Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "datasource");
}
private static <R extends Result> void execBulkTag(
SparkSession spark,
String inputPath,
String workingPath,
String datasourceMapPath,
String resultType,
Class<R> resultClazz) {
List<String> hostedByList = readPath(spark, datasourceMapPath, DatasourceMaster.class)
List<String> hostedByList = readPath(spark, workingPath + "datasource", DatasourceMaster.class)
.map((MapFunction<DatasourceMaster, String>) dm -> dm.getMaster(), Encoders.STRING())
.collectAsList();
readPath(spark, inputPath, resultClazz)
.map(patchResult(), Encoders.bean(resultClazz))
.filter(Objects::nonNull)
readPath(spark, inputPath + resultType, resultClazz)
.map(
(MapFunction<R, R>) value -> enrich(value, hostedByList),
Encoders.bean(resultClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
.json(workingPath + resultType);
readPath(spark, workingPath, resultClazz)
readPath(spark, workingPath + resultType, resultClazz)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(inputPath);
.json(inputPath + resultType);
}
private static <R extends Result> R enrich(R value, List<String> hostedByList) {
if (value.getDataInfo().getDeletedbyinference() == null) {
value.getDataInfo().setDeletedbyinference(false);
}
if (value.getContext() == null) {
value.setContext(new ArrayList<>());
}
if (value
.getInstance()
.stream()

View File

@@ -8,10 +8,11 @@ import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
import eu.dnetlib.dhp.countrypropagation.pojo.DatasourceCountry;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
@@ -20,8 +21,7 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.EntityEntityRel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Datasource;
@@ -57,8 +57,8 @@ public class PrepareDatasourceCountryAssociation {
String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath {}: ", workingPath);
SparkConf conf = new SparkConf();
@@ -66,13 +66,13 @@
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
removeOutputDir(spark, workingPath + "/datasourceCountry");
prepareDatasourceCountryAssociation(
spark,
Arrays.asList(parser.get("whitelist").split(";")),
Arrays.asList(parser.get("allowedtypes").split(";")),
inputPath,
outputPath);
workingPath + "/datasourceCountry");
});
}

View File

@@ -2,20 +2,19 @@
package eu.dnetlib.dhp.countrypropagation;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
import eu.dnetlib.dhp.countrypropagation.pojo.DatasourceCountry;
import eu.dnetlib.dhp.countrypropagation.pojo.ResultCountrySet;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
@@ -23,6 +22,8 @@ import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.EntityEntityRel;
import eu.dnetlib.dhp.PropagationConstant;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
@@ -45,19 +46,18 @@ public class PrepareResultCountrySet {
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String datasourcecountrypath = parser.get("preparedInfoPath");
log.info("preparedInfoPath: {}", datasourcecountrypath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
log.info("resultType: {}", resultType);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
@@ -66,50 +66,31 @@
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
removeOutputDir(spark, workingPath + "/preparedInfo/" + resultType);
getPotentialResultToUpdate(
spark,
inputPath,
outputPath,
datasourcecountrypath,
workingPath,
resultClazz);
resultType,
resultClazz);
});
}
private static <R extends Result> void getPotentialResultToUpdate(
SparkSession spark,
String inputPath,
String outputPath,
String datasourcecountrypath,
String workingPath,
String resultType,
Class<R> resultClazz) {
// selects all the results that are neither deleted by inference nor invisible
Dataset<R> result = readPath(spark, inputPath, resultClazz)
.filter(
(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() &&
!r.getDataInfo().getInvisible());
final String datasourcecountrypath = workingPath + "/datasourceCountry";
final String cfhbpath = workingPath + "/resultCfHb/" + resultType;
final String outputPath = workingPath + "/preparedInfo/" + resultType;
// for each result, collects the distinct collectedfrom keys (at the level of the result) and hostedby keys,
// and produces a (resultId, key) pair for each distinct key associated to the result
result.flatMap((FlatMapFunction<R, EntityEntityRel>) r -> {
Set<String> cfhb = r.getCollectedfrom().stream().map(cf -> cf.getKey()).collect(Collectors.toSet());
cfhb.addAll(r.getInstance().stream().map(i -> i.getHostedby().getKey()).collect(Collectors.toSet()));
return cfhb
.stream()
.map(value -> EntityEntityRel.newInstance(r.getId(), value))
.collect(Collectors.toList())
.iterator();
}, Encoders.bean(EntityEntityRel.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/resultCfHb");
PropagationConstant.createCfHbforResult(spark, inputPath, cfhbpath, resultClazz);
Dataset<DatasourceCountry> datasource_country = readPath(spark, datasourcecountrypath, DatasourceCountry.class);
Dataset<EntityEntityRel> cfhb = readPath(spark, workingPath + "/resultCfHb", EntityEntityRel.class);
Dataset<EntityEntityRel> cfhb = readPath(spark, cfhbpath, EntityEntityRel.class);
datasource_country
.joinWith(

View File

@@ -9,6 +9,8 @@ import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
import eu.dnetlib.dhp.countrypropagation.pojo.ResultCountrySet;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
@@ -47,15 +49,17 @@ public class SparkCountryPropagationJob {
String sourcePath = parser.get("sourcePath");
log.info("sourcePath: {}", sourcePath);
String preparedInfoPath = parser.get("preparedInfoPath");
log.info("preparedInfoPath: {}", preparedInfoPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
log.info("resultType: {}", resultType);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
@@ -63,12 +67,12 @@ public class SparkCountryPropagationJob {
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
removeOutputDir(spark, workingPath + "/" + resultType);
execPropagation(
spark,
sourcePath,
preparedInfoPath,
outputPath,
workingPath,
resultType,
resultClazz);
});
}
@@ -76,18 +80,15 @@
private static <R extends Result> void execPropagation(
SparkSession spark,
String sourcePath,
String preparedInfoPath,
String outputPath,
String workingPath,
String resultType,
Class<R> resultClazz) {
log.info("Reading Graph table from: {}", sourcePath);
Dataset<R> res = readPath(spark, sourcePath, resultClazz);
log.info("Reading prepared info: {}", preparedInfoPath);
Dataset<ResultCountrySet> prepared = spark
.read()
.json(preparedInfoPath)
.as(Encoders.bean(ResultCountrySet.class));
log.info("Reading prepared info: {}", workingPath + "/preparedInfo/" + resultType);
Dataset<ResultCountrySet> prepared = readPath(spark, workingPath + "/preparedInfo/" + resultType, ResultCountrySet.class);
res
.joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer")
@@ -95,7 +96,13 @@
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
.json(workingPath + "/" + resultType);
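// note: the enriched results are materialized under workingPath first, then copied back
// over sourcePath, since Spark cannot safely overwrite a path it is still reading from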
readPath(spark, workingPath + "/" + resultType, resultClazz)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(sourcePath);
}

View File

@@ -1,5 +1,5 @@
package eu.dnetlib.dhp.countrypropagation;
package eu.dnetlib.dhp.countrypropagation.pojo;
import java.io.Serializable;

View File

@@ -1,5 +1,5 @@
package eu.dnetlib.dhp.countrypropagation;
package eu.dnetlib.dhp.countrypropagation.pojo;
import java.io.Serializable;

View File

@@ -1,5 +1,7 @@
package eu.dnetlib.dhp.countrypropagation;
package eu.dnetlib.dhp.countrypropagation.pojo;
import eu.dnetlib.dhp.countrypropagation.pojo.CountrySbs;
import java.io.Serializable;
import java.util.ArrayList;

View File

@@ -1,7 +1,9 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
public class AutoritativeAuthor {
import java.io.Serializable;
public class AutoritativeAuthor implements Serializable {
private String name;
private String surname;
@@ -40,4 +42,13 @@ public class AutoritativeAuthor {
this.orcid = orcid;
}
public static AutoritativeAuthor newInstance(String name, String surname, String fullname, String orcid) {
AutoritativeAuthor aa = new AutoritativeAuthor();
aa.name = name;
aa.surname = surname;
aa.fullname = fullname;
aa.orcid = orcid;
return aa;
}
}

View File

@@ -0,0 +1,90 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.PropagationConstant.readPath;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import scala.Tuple2;
public class PrepareResultOrcidAssociationStep0 implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep0.class);
public static void main(String[] args) throws Exception {
String jsonConf = IOUtils
.toString(
PrepareResultOrcidAssociationStep0.class
.getResourceAsStream(
"/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult0_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf);
parser.parseArgument(args);
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final List<String> allowedsemrel = Arrays
.stream(parser.get("allowedsemrels").split(";"))
.map(s -> s.toLowerCase())
.collect(Collectors.toList());
log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
selectRelations(
spark, inputPath, outputPath, allowedsemrel);
});
}
private static void selectRelations(SparkSession spark, String inputPath, String outputPath,
List<String> allowedsemrel) {
readPath(spark, inputPath, Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference()
&& allowedsemrel.contains(r.getRelClass().toLowerCase()))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
}

View File

@@ -2,26 +2,37 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.sources.v2.reader.InputPartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import scala.Tuple2;
public class PrepareResultOrcidAssociationStep1 {
private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class);
@@ -42,83 +53,112 @@ public class PrepareResultOrcidAssociationStep1 {
String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
final String outputPath = parser.get("workingPath");
log.info("outputPath: {}", outputPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
final List<String> allowedsemrel = Arrays
.stream(parser.get("allowedsemrels").split(";"))
.map(s -> s.toLowerCase())
.collect(Collectors.toList());
log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
final List<String> allowedPids = Arrays.asList(parser.get("allowedpids").split(";"));
log.info("allowedPids: {}", new Gson().toJson(allowedPids));
final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
log.info("resultType: {}", resultType);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
String inputRelationPath = inputPath + "/relation";
log.info("inputRelationPath: {}", inputRelationPath);
String inputResultPath = inputPath + "/" + resultType;
log.info("inputResultPath: {}", inputResultPath);
String outputResultPath = outputPath + "/" + resultType;
log.info("outputResultPath: {}", outputResultPath);
runWithSparkHiveSession(
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
// removeOutputDir(spark, outputPath);
prepareInfo(
spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel);
spark, inputPath, outputPath, resultType, resultClazz, allowedsemrel, allowedPids);
});
}
private static <R extends Result> void prepareInfo(
SparkSession spark,
String inputRelationPath,
String inputResultPath,
String outputResultPath,
String inputPath,
String outputPath,
String resultType,
Class<R> resultClazz,
List<String> allowedsemrel) {
List<String> allowedsemrel,
List<String> allowedPids) {
Dataset<Relation> relation = readPath(spark, inputRelationPath, Relation.class);
relation.createOrReplaceTempView("relation");
final String inputResultPath = inputPath + "/" + resultType;
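// relationSubset: presumably the subset of relations restricted to the allowed semantic
// classes, materialized beforehand by PrepareResultOrcidAssociationStep0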
Dataset<Relation> relation = readPath(spark, outputPath + "/relationSubset", Relation.class);
log.info("Reading Graph table from: {}", inputResultPath);
Dataset<R> result = readPath(spark, inputResultPath, resultClazz);
result.createOrReplaceTempView("result");
String query = "SELECT target resultId, author authorList"
+ " FROM (SELECT id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author "
+ " FROM ( "
+ " SELECT DISTINCT id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid "
+ " FROM result "
+ " LATERAL VIEW EXPLODE (author) a AS MyT "
+ " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP "
+ " WHERE lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID + "' or "
+ " lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp "
+ " GROUP BY id) r_t "
+ " JOIN ("
+ " SELECT source, target "
+ " FROM relation "
+ " WHERE datainfo.deletedbyinference = false "
+ getConstraintList(" lower(relclass) = '", allowedsemrel)
+ " ) rel_rel "
+ " ON source = id";
final String resultOutputPath = outputPath + "/resultSubset/" + resultType;
log.info("executedQuery: {}", query);
spark
.sql(query)
.as(Encoders.bean(ResultOrcidList.class))
readPath(spark, inputResultPath, resultClazz)
.filter(
(FilterFunction<R>) r -> !r.getDataInfo().getDeletedbyinference() && !r.getDataInfo().getInvisible())
.filter(
(FilterFunction<R>) r -> Optional
.ofNullable(r.getAuthor())
.map(
al -> al
.stream()
.anyMatch(
a -> hasAllowedPid(a, allowedPids)))
.orElse(false)
)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(resultOutputPath);
Dataset<R> result = readPath(spark, resultOutputPath, resultClazz);
// result.foreach((ForeachFunction<R>) r -> System.out.println(new ObjectMapper().writeValueAsString(r)));
result
.joinWith(relation, result.col("id").equalTo(relation.col("source")))
.map((MapFunction<Tuple2<R, Relation>, ResultOrcidList>) t2 -> {
ResultOrcidList rol = new ResultOrcidList();
rol.setResultId(t2._2().getTarget());
List<AutoritativeAuthor> aal = new ArrayList<>();
t2._1().getAuthor().stream().forEach(a -> {
a.getPid().stream().forEach(p -> {
if (allowedPids.contains(p.getQualifier().getClassid().toLowerCase())) {
aal
.add(
AutoritativeAuthor
.newInstance(a.getName(), a.getSurname(), a.getFullname(), p.getValue()));
}
});
});
rol.setAuthorList(aal);
return rol;
}, Encoders.bean(ResultOrcidList.class))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputResultPath);
.json(outputPath + "/" + resultType);
}
private static boolean hasAllowedPid(Author a, List<String> allowedPids) {
Optional<List<StructuredProperty>> oPid = Optional.ofNullable(a.getPid());
if (!oPid.isPresent()) {
return false;
}
return oPid.get().stream().anyMatch(p -> allowedPids.contains(p.getQualifier().getClassid().toLowerCase()));
}
}

View File

@@ -10,6 +10,8 @@ import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -50,7 +52,7 @@ public class PrepareResultOrcidAssociationStep2 {
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
// removeOutputDir(spark, outputPath);
mergeInfo(spark, inputPath, outputPath);
});
}
@@ -63,33 +65,31 @@
.union(readPath(spark, inputPath + "/software", ResultOrcidList.class));
resultOrcidAssoc
.toJavaRDD()
.mapToPair(r -> new Tuple2<>(r.getResultId(), r))
.reduceByKey(
(a, b) -> {
if (a == null) {
return b;
}
if (b == null) {
return a;
}
.groupByKey((MapFunction<ResultOrcidList, String>) rol -> rol.getResultId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, ResultOrcidList, ResultOrcidList>) (k, it) -> {
ResultOrcidList resultOrcidList = it.next();
if (it.hasNext()) {
Set<String> orcid_set = new HashSet<>();
a.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid()));
b
.getAuthorList()
.stream()
.forEach(
aa -> {
if (!orcid_set.contains(aa.getOrcid())) {
a.getAuthorList().add(aa);
orcid_set.add(aa.getOrcid());
}
});
return a;
})
.map(Tuple2::_2)
.map(r -> OBJECT_MAPPER.writeValueAsString(r))
.saveAsTextFile(outputPath, GzipCodec.class);
resultOrcidList.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid()));
it
.forEachRemaining(
val -> val
.getAuthorList()
.stream()
.forEach(
aa -> {
if (!orcid_set.contains(aa.getOrcid())) {
resultOrcidList.getAuthorList().add(aa);
orcid_set.add(aa.getOrcid());
}
}));
}
return resultOrcidList;
}, Encoders.bean(ResultOrcidList.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
}
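
The groupByKey/mapGroups rewrite above merges the per-type association lists without Hive and without the RDD reduceByKey. A plain-Java sketch of the merge logic inside mapGroups, using a hypothetical minimal type:

import java.util.*;

// Sketch of the mapGroups merge: keep the first record of a group and append
// authors from later records only when their ORCID has not been seen yet.
public class MergeOrcidSketch {

	static class AutoritativeAuthor {
		final String orcid;
		AutoritativeAuthor(String orcid) { this.orcid = orcid; }
		String getOrcid() { return orcid; }
	}

	static List<AutoritativeAuthor> merge(Iterator<List<AutoritativeAuthor>> group) {
		List<AutoritativeAuthor> acc = new ArrayList<>(group.next());
		Set<String> seen = new HashSet<>();
		acc.forEach(a -> seen.add(a.getOrcid()));
		group.forEachRemaining(list -> list.forEach(a -> {
			if (seen.add(a.getOrcid())) { // add() returns false for duplicates
				acc.add(a);
			}
		}));
		return acc;
	}

	public static void main(String[] args) {
		Iterator<List<AutoritativeAuthor>> group = Arrays.asList(
			Arrays.asList(new AutoritativeAuthor("0000-0001"), new AutoritativeAuthor("0000-0002")),
			Arrays.asList(new AutoritativeAuthor("0000-0002"), new AutoritativeAuthor("0000-0003")))
			.iterator();
		System.out.println(merge(group).size()); // 3 distinct ORCIDs survive the merge
	}
}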

View File

@ -2,7 +2,7 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.List;
import java.util.Optional;
@ -56,25 +56,16 @@ public class SparkOrcidToResultFromSemRelJob {
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final Boolean saveGraph = Optional
.ofNullable(parser.get("saveGraph"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("saveGraph: {}", saveGraph);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
runWithSparkHiveSession(
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
if (saveGraph) {
execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
}
execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
});
}
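
The import swap above moves the job from runWithSparkHiveSession to runWithSparkSession: once the SQL-on-Hive temp views are gone, a plain session suffices. A hedged sketch of what such a helper typically does (the real one lives in SparkSessionSupport; this is an approximation):

import java.util.function.Consumer;

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

// Build a plain SparkSession (no Hive support) and stop it only when the
// caller asked the helper to manage its lifecycle.
public class SparkSessionSketch {

	public static void runWithSparkSession(SparkConf conf, Boolean isManaged, Consumer<SparkSession> fn) {
		SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
		try {
			fn.accept(spark);
		} finally {
			if (Boolean.TRUE.equals(isManaged)) {
				spark.stop();
			}
		}
	}
}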

View File

@ -51,8 +51,7 @@ public class SparkResultToProjectThroughSemRelJob {
final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
final Boolean saveGraph = Boolean.valueOf(parser.get("saveGraph"));
log.info("saveGraph: {}", saveGraph);
SparkConf conf = new SparkConf();
@ -60,11 +59,9 @@ public class SparkResultToProjectThroughSemRelJob {
conf,
isSparkSessionManaged,
spark -> {
if (isTest(parser)) {
removeOutputDir(spark, outputPath);
}
execPropagation(
spark, outputPath, alreadyLinkedPath, potentialUpdatePath, saveGraph);
spark, outputPath, alreadyLinkedPath, potentialUpdatePath);
});
}
@ -72,13 +69,12 @@ public class SparkResultToProjectThroughSemRelJob {
SparkSession spark,
String outputPath,
String alreadyLinkedPath,
String potentialUpdatePath,
Boolean saveGraph) {
String potentialUpdatePath) {
Dataset<ResultProjectSet> toaddrelations = readPath(spark, potentialUpdatePath, ResultProjectSet.class);
Dataset<ResultProjectSet> alreadyLinked = readPath(spark, alreadyLinkedPath, ResultProjectSet.class);
if (saveGraph) {
toaddrelations
.joinWith(
alreadyLinked,
@ -89,7 +85,7 @@ public class SparkResultToProjectThroughSemRelJob {
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(outputPath);
}
}
private static FlatMapFunction<Tuple2<ResultProjectSet, ResultProjectSet>, Relation> mapRelationRn() {
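
After the join of potential updates with the already-linked pairs, mapRelationRn() has to emit relations only for projects not yet linked to the result. A toy sketch of that set difference (names and types are illustrative, not the real flatMap):

import java.util.*;
import java.util.stream.*;

// The new relations to emit are the potential project ids minus the ones
// already linked to the result.
public class NewProjectLinksSketch {

	static List<String> newTargets(List<String> potential, List<String> alreadyLinked) {
		Set<String> linked = new HashSet<>(
			alreadyLinked == null ? Collections.<String> emptyList() : alreadyLinked);
		return potential.stream().filter(t -> !linked.contains(t)).collect(Collectors.toList());
	}

	public static void main(String[] args) {
		System.out.println(newTargets(Arrays.asList("p1", "p2", "p3"), Arrays.asList("p2"))); // [p1, p3]
	}
}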

View File

@ -56,11 +56,7 @@ public class SparkResultToCommunityFromOrganizationJob {
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final Boolean saveGraph = Optional
.ofNullable(parser.get("saveGraph"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("saveGraph: {}", saveGraph);
@SuppressWarnings("unchecked")
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
@ -72,10 +68,9 @@ public class SparkResultToCommunityFromOrganizationJob {
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
if (saveGraph) {
execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
}
});
}
@ -99,6 +94,12 @@ public class SparkResultToCommunityFromOrganizationJob {
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
readPath(spark, outputPath, resultClazz)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(inputPath);
}
private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> resultCommunityFn() {

View File

@ -70,13 +70,10 @@ public class SparkResultToCommunityThroughSemRelJob {
conf,
isSparkSessionManaged,
spark -> {
if (isTest(parser)) {
removeOutputDir(spark, outputPath);
}
if (saveGraph) {
execPropagation(
spark, inputPath, outputPath, preparedInfoPath, resultClazz);
}
});
}
@ -100,6 +97,12 @@ public class SparkResultToCommunityThroughSemRelJob {
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
readPath(spark, outputPath, resultClazz)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(inputPath);
}
private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> contextUpdaterFn() {

View File

@ -23,4 +23,11 @@ public class DatasourceOrganization implements Serializable {
public void setOrganizationId(String organizationId) {
this.organizationId = organizationId;
}
public static DatasourceOrganization newInstance(String datasourceId, String organizationId) {
DatasourceOrganization dso = new DatasourceOrganization();
dso.datasourceId = datasourceId;
dso.organizationId = organizationId;
return dso;
}
}
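
A short hypothetical usage of the new factory, as the prepare step does when mapping join tuples (assuming the usual getters exist alongside the setters shown above):

public class DatasourceOrganizationDemo {
	public static void main(String[] args) {
		// Hypothetical ids; in the job these come from the relation's source/target.
		DatasourceOrganization dso = DatasourceOrganization.newInstance("datasource-id", "organization-id");
		System.out.println(dso.getDatasourceId() + " -> " + dso.getOrganizationId());
	}
}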

View File

@ -2,17 +2,17 @@
package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
@ -28,6 +28,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
public class PrepareResultInstRepoAssociation {
@ -49,14 +50,11 @@ public class PrepareResultInstRepoAssociation {
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("sourcePath");
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String datasourceOrganizationPath = parser.get("datasourceOrganizationPath");
log.info("datasourceOrganizationPath {}: ", datasourceOrganizationPath);
final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
List<String> blacklist = Optional
.ofNullable(parser.get("blacklist"))
@ -64,82 +62,92 @@ public class PrepareResultInstRepoAssociation {
.orElse(new ArrayList<>());
SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
runWithSparkHiveSession(
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
readNeededResources(spark, inputPath);
readNeededResources(spark, inputPath, workingPath, blacklist);
removeOutputDir(spark, datasourceOrganizationPath);
prepareDatasourceOrganization(spark, datasourceOrganizationPath, blacklist);
prepareDatasourceOrganization(spark, workingPath);
removeOutputDir(spark, alreadyLinkedPath);
prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
prepareAlreadyLinkedAssociation(spark, workingPath);
});
}
private static void readNeededResources(SparkSession spark, String inputPath) {
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
datasource.createOrReplaceTempView("datasource");
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
relation.createOrReplaceTempView("relation");
Dataset<Organization> organization = readPath(spark, inputPath + "/organization", Organization.class);
organization.createOrReplaceTempView("organization");
}
private static void prepareDatasourceOrganization(
SparkSession spark, String datasourceOrganizationPath, List<String> blacklist) {
final String blacklisted = blacklist
.stream()
.map(s -> " AND id != '" + s + "'")
.collect(Collectors.joining());
String query = "SELECT source datasourceId, target organizationId "
+ "FROM ( SELECT id "
+ "FROM datasource "
+ "WHERE datasourcetype.classid = '"
+ INSTITUTIONAL_REPO_TYPE
+ "' "
+ "AND datainfo.deletedbyinference = false " + blacklisted + " ) d "
+ "JOIN ( SELECT source, target "
+ "FROM relation "
+ "WHERE lower(relclass) = '"
+ ModelConstants.IS_PROVIDED_BY.toLowerCase()
+ "' "
+ "AND datainfo.deletedbyinference = false ) rel "
+ "ON d.id = rel.source ";
spark
.sql(query)
.as(Encoders.bean(DatasourceOrganization.class))
private static void readNeededResources(SparkSession spark, String inputPath, String workingPath,
List<String> blacklist) {
readPath(spark, inputPath + "/datasource", Datasource.class)
.filter(
(FilterFunction<Datasource>) ds -> !blacklist.contains(ds.getId()) &&
!ds.getDataInfo().getDeletedbyinference() &&
ds.getDatasourcetype().getClassid().equals(INSTITUTIONAL_REPO_TYPE))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(datasourceOrganizationPath);
.json(workingPath + "/datasource");
readPath(spark, inputPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
(r.getRelClass().toLowerCase().equals(ModelConstants.IS_PROVIDED_BY.toLowerCase()) ||
r.getRelClass().toLowerCase().equals(ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase())))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/relation");
}
private static void prepareDatasourceOrganization(
SparkSession spark, String workingPath) {
Dataset<Datasource> datasource = readPath(spark, workingPath + "/datasource", Datasource.class);
Dataset<Relation> relation = readPath(spark, workingPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> r
.getRelClass()
.toLowerCase()
.equals(ModelConstants.IS_PROVIDED_BY.toLowerCase()));
datasource
.joinWith(relation, datasource.col("id").equalTo(relation.col("source")))
.map(
(MapFunction<Tuple2<Datasource, Relation>, DatasourceOrganization>) t2 -> DatasourceOrganization
.newInstance(t2._2().getSource(), t2._2().getTarget()),
Encoders.bean(DatasourceOrganization.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/ datasourceOrganization");
;
}
private static void prepareAlreadyLinkedAssociation(
SparkSession spark, String alreadyLinkedPath) {
String query = "Select source key, collect_set(target) valueSet "
+ "from relation "
+ "where datainfo.deletedbyinference = false "
+ "and lower(relClass) = '"
+ ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase()
+ "' "
+ "group by source";
SparkSession spark, String workingPath) {
readPath(spark, workingPath + "/relation", Relation.class)
.filter(
(FilterFunction<Relation>) r -> r
.getRelClass()
.toLowerCase()
.equals(ModelConstants.HAS_AUTHOR_INSTITUTION.toLowerCase()))
.groupByKey((MapFunction<Relation, String>) r -> r.getSource(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Relation, KeyValueSet>) (k, it) -> {
Set<String> values = new HashSet<>();
KeyValueSet kvs = new KeyValueSet();
kvs.setKey(k);
values.add(it.next().getTarget());
it.forEachRemaining(r -> values.add(r.getTarget()));
kvs.setValueSet(new ArrayList<>(values));
return kvs;
}, Encoders.bean(KeyValueSet.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/alreadyLinked");
spark
.sql(query)
.as(Encoders.bean(KeyValueSet.class))
// TODO retry to stick with datasets
.toJavaRDD()
.map(r -> OBJECT_MAPPER.writeValueAsString(r))
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
}
}
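
The refactor above turns the former SQL WHERE clause into a plain Dataset predicate. A self-contained sketch of that filter with a stub type (the institutional-repository constant is passed as a parameter; the example value is an assumption):

import java.util.*;
import java.util.function.Predicate;

// Keep a datasource when it is not blacklisted, not deleted by inference,
// and of institutional repository type.
public class DatasourceFilterSketch {

	static class Ds {
		String id;
		boolean deletedByInference;
		String datasourceTypeClassid;
	}

	static Predicate<Ds> keep(List<String> blacklist, String institutionalRepoType) {
		return ds -> !blacklist.contains(ds.id)
			&& !ds.deletedByInference
			&& institutionalRepoType.equals(ds.datasourceTypeClassid);
	}

	public static void main(String[] args) {
		Ds ds = new Ds();
		ds.id = "ds1";
		ds.deletedByInference = false;
		ds.datasourceTypeClassid = "pubsrepository::institutional"; // assumed example value
		System.out.println(keep(Collections.emptyList(), "pubsrepository::institutional").test(ds)); // true
		System.out.println(keep(Collections.singletonList("ds1"), "pubsrepository::institutional").test(ds)); // false
	}
}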

View File

@ -11,10 +11,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -71,12 +68,6 @@ public class SparkResultToOrganizationFromIstRepoJob {
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final Boolean saveGraph = Optional
.ofNullable(parser.get("saveGraph"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("saveGraph: {}", saveGraph);
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
@ -86,15 +77,15 @@ public class SparkResultToOrganizationFromIstRepoJob {
conf,
isSparkSessionManaged,
spark -> {
if (saveGraph) {
execPropagation(
spark,
datasourceorganization,
alreadylinked,
inputPath,
outputPath,
resultClazz);
}
execPropagation(
spark,
datasourceorganization,
alreadylinked,
inputPath,
outputPath,
resultClazz);
});
}
@ -119,9 +110,15 @@ public class SparkResultToOrganizationFromIstRepoJob {
"left_outer")
.flatMap(createRelationFn(), Encoders.bean(Relation.class))
.write()
.mode(SaveMode.Append)
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
readPath(spark, outputPath, Relation.class)
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(inputPath.substring(0, inputPath.lastIndexOf("/") + 1) + "relation");
}
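
The change above writes the new relations with Overwrite to their own output and then appends them into the graph's relation directory derived from inputPath. A sketch of that append-back step, mirroring the path derivation in the diff:

import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

// Materialize the new relations, then re-read them and append to the sibling
// "relation" directory of the input result path.
public class AppendRelationsSketch {

	static void appendToGraph(SparkSession spark, String outputPath, String inputPath) {
		String relationDir = inputPath.substring(0, inputPath.lastIndexOf("/") + 1) + "relation";
		spark
			.read()
			.json(outputPath)
			.write()
			.mode(SaveMode.Append)
			.option("compression", "gzip")
			.json(relationDir);
	}
}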
private static FlatMapFunction<Tuple2<KeyValueSet, KeyValueSet>, Relation> createRelationFn() {
@ -159,8 +156,16 @@ public class SparkResultToOrganizationFromIstRepoJob {
Dataset<R> result = readPath(spark, inputPath, resultClazz);
result.createOrReplaceTempView("result");
createCfHbforResult(spark);
Dataset<Row> cfhb = spark
.sql(
"select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb "
+
"from result r " +
"lateral view explode(instance) i as inst " +
"where r.datainfo.deletedbyinference=false");
// createCfHbforResult(spark);
cfhb.createOrReplaceTempView("cfhb");
dsOrg.createOrReplaceTempView("rels");
return spark

View File

@ -98,13 +98,13 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
String leavesPath,
String childParentPath,
String resultOrganizationPath,
String graphPath,
String relationPath,
String workingPath,
String outputPath,
int iterations) {
if (iterations == 1) {
doPropagateOnce(
spark, leavesPath, childParentPath, resultOrganizationPath, graphPath,
spark, leavesPath, childParentPath, resultOrganizationPath, relationPath,
workingPath, outputPath);
} else {
@ -123,26 +123,26 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
notReachedFirstParent);
doPropagate(
spark, leavesPath, childParentPath, resultOrganizationPath, graphPath,
spark, leavesPath, childParentPath, resultOrganizationPath, relationPath,
workingPath, outputPath, propagationCounter);
}
}
private static void doPropagateOnce(SparkSession spark, String leavesPath, String childParentPath,
String resultOrganizationPath, String graphPath, String workingPath,
String resultOrganizationPath, String relationPath, String workingPath,
String outputPath) {
StepActions
.execStep(
spark, graphPath, workingPath + NEW_RELATION_PATH,
spark, relationPath, workingPath + NEW_RELATION_PATH,
leavesPath, childParentPath, resultOrganizationPath);
addNewRelations(spark, workingPath + NEW_RELATION_PATH, outputPath);
}
private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath,
String resultOrganizationPath, String graphPath, String workingPath, String outputPath,
String resultOrganizationPath, String relationPath, String workingPath, String outputPath,
PropagationCounter propagationCounter) {
int iteration = 0;
long leavesCount;
@ -151,7 +151,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
iteration++;
StepActions
.execStep(
spark, graphPath, workingPath + NEW_RELATION_PATH,
spark, relationPath, workingPath + NEW_RELATION_PATH,
leavesPath, childParentPath, resultOrganizationPath);
StepActions
.prepareForNextStep(
@ -225,7 +225,6 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
, Encoders.bean(Relation.class))
.write()
.mode(SaveMode.Append)
.option("compression", "gzip")
.json(outputPath);
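
Beyond the graphPath-to-relationPath rename, the iterative doPropagate loop above walks the organization hierarchy one hop per round. A toy, self-contained sketch of those semantics (the real job does this with Spark datasets and accumulators):

import java.util.*;

// Starting from the organizations already linked to a result, follow the
// child -> parent map one level per iteration until nothing changes or an
// iteration cap is hit.
public class IterativePropagationSketch {

	public static void main(String[] args) {
		Map<String, String> childParent = new HashMap<>();
		childParent.put("orgA", "orgB"); // orgA is a child of orgB
		childParent.put("orgB", "orgC");

		Set<String> resultOrgs = new HashSet<>(Collections.singleton("orgA"));
		int iteration = 0;
		int maxIterations = 10;
		boolean changed;
		do {
			iteration++;
			changed = false;
			for (String org : new ArrayList<>(resultOrgs)) {
				String parent = childParent.get(org);
				if (parent != null && resultOrgs.add(parent)) {
					changed = true; // a new result -> parent relation would be emitted here
				}
			}
		} while (changed && iteration < maxIterations);

		System.out.println(resultOrgs); // [orgA, orgB, orgC] (order may vary)
	}
}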

View File

@ -27,10 +27,10 @@ import scala.Tuple2;
public class StepActions implements Serializable {
public static void execStep(SparkSession spark,
String graphPath, String newRelationPath,
String relationPath, String newRelationPath,
String leavesPath, String chldParentOrgPath, String resultOrgPath) {
Dataset<Relation> relationGraph = readPath(spark, graphPath, Relation.class);
Dataset<Relation> relationGraph = readPath(spark, relationPath, Relation.class);
// select only the relation source target among those proposed by propagation that are not already existent
getNewRels(
newRelationPath, relationGraph,
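
The comment above says getNewRels keeps only the proposed relations that do not already exist in the graph. One way to express that selection is a left anti join on both endpoints; a sketch with column names as in the Relation model (the actual implementation may differ):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Keep proposed (source, target) pairs that have no match among the existing
// relations.
public class NewRelsSketch {

	static Dataset<Row> onlyNew(Dataset<Row> proposed, Dataset<Row> existing) {
		return proposed
			.join(
				existing,
				proposed.col("source").equalTo(existing.col("source"))
					.and(proposed.col("target").equalTo(existing.col("target"))),
				"left_anti");
	}
}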

View File

@ -29,6 +29,13 @@
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName": "rt",
"paramLongName": "resultType",
"paramDescription": "the result type",
"paramRequired": true
}
]

View File

@ -11,17 +11,12 @@
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "p",
"paramLongName": "preparedInfoPath",
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the path where prepared info have been stored",
"paramRequired": false
"paramRequired": true
},
{
"paramName": "ssm",

View File

@ -6,8 +6,8 @@
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},

View File

@ -5,12 +5,6 @@
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName":"out",
"paramLongName":"outputPath",
"paramDescription": "the output path",
"paramRequired": true
},
{
"paramName":"w",
"paramLongName":"workingPath",
@ -23,12 +17,7 @@
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName": "p",
"paramLongName": "preparedInfoPath",
"paramDescription": "the path where prepared info have been stored",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",

View File

@ -5,12 +5,6 @@
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName":"sg",
"paramLongName":"saveGraph",
"paramDescription": "true if the new version of the graph must be saved",
"paramRequired": false
},
{
"paramName":"h",
"paramLongName":"hive_metastore_uris",

View File

@ -0,0 +1,26 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName":"as",
"paramLongName":"allowedsemrels",
"paramDescription": "the allowed sematinc relations for propagation",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]
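
These JSON descriptors are consumed by the jobs' argument parser, with paramLongName as the lookup key. A sketch of the usual pattern, visible in the classes above (the resource path here is illustrative):

import org.apache.commons.io.IOUtils;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

// Load the descriptor from the classpath, parse the CLI args against it, and
// read values by their long names.
public class ParamParsingSketch {

	public static void main(String[] args) throws Exception {
		String json = IOUtils
			.toString(ParamParsingSketch.class.getResourceAsStream("/eu/dnetlib/dhp/input_parameters.json"));
		ArgumentApplicationParser parser = new ArgumentApplicationParser(json);
		parser.parseArgument(args);
		String sourcePath = parser.get("sourcePath"); // looked up by paramLongName
		String allowedsemrels = parser.get("allowedsemrels"); // comma-separated list in practice
		System.out.println(sourcePath + " / " + allowedsemrels);
	}
}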

View File

@ -12,14 +12,14 @@
"paramRequired": true
},
{
"paramName":"h",
"paramLongName":"hive_metastore_uris",
"paramDescription": "the hive metastore uris",
"paramName":"ap",
"paramLongName":"allowedpids",
"paramDescription": "the allowed pid type to be used for propagation",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},

View File

@ -1,184 +0,0 @@
<workflow-app name="project_to_result_propagation" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>allowedsemrels</name>
<description>the allowed semantics </description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="reset_outputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_publication"/>
<path start="copy_dataset"/>
<path start="copy_orp"/>
<path start="copy_software"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/publication</arg>
<arg>${nameNode}/${outputPath}/publication</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/dataset</arg>
<arg>${nameNode}/${outputPath}/dataset</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_orp">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_software">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/software</arg>
<arg>${nameNode}/${outputPath}/software</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<join name="wait" to="prepare_project_results_association"/>
<action name="prepare_project_results_association">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareProjectResultsAssociation</name>
<class>eu.dnetlib.dhp.projecttoresult.PrepareProjectResultsAssociation</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--potentialUpdatePath</arg><arg>${workingDir}/preparedInfo/potentialUpdates</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
</spark>
<ok to="apply_propagation"/>
<error to="Kill"/>
</action>
<action name="apply_propagation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ProjectToResultPropagation</name>
<class>eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--potentialUpdatePath</arg><arg>${workingDir}/preparedInfo/potentialUpdates</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -6,21 +6,9 @@
"paramRequired": true
},
{
"paramName":"h",
"paramLongName":"hive_metastore_uris",
"paramDescription": "the hive metastore uris",
"paramRequired": true
},
{
"paramName":"dop",
"paramLongName":"datasourceOrganizationPath",
"paramDescription": "path where to store/find association from datasource and organization",
"paramRequired": true
},
{
"paramName":"alp",
"paramLongName":"alreadyLinkedPath",
"paramDescription": "path where to store/find already linked results and organizations",
"paramName":"wp",
"paramLongName":"workingPath",
"paramDescription": "path where to store/find prepared/ filtered data",
"paramRequired": true
},
{

View File

@ -1,195 +0,0 @@
<workflow-app name="affiliation_from_semrel_propagation" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>outputPath</name>
<description>sets the outputPath</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<decision name="resume_from">
<switch>
<case to="prepare_info">${wf:conf('resumeFrom') eq 'PrepareInfo'}</case>
<default to="reset_outputpath"/> <!-- first action to be done when downloadDump is to be performed -->
</switch>
</decision>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_publication"/>
<path start="copy_dataset"/>
<path start="copy_orp"/>
<path start="copy_software"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/publication</arg>
<arg>${nameNode}/${outputPath}/publication</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/dataset</arg>
<arg>${nameNode}/${outputPath}/dataset</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_orp">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_software">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/software</arg>
<arg>${nameNode}/${outputPath}/software</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<join name="wait" to="prepare_info"/>
<action name="prepare_info">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareResultOrganizationAssociation</name>
<class>eu.dnetlib.dhp.resulttoorganizationfromsemrel.PrepareInfo</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--graphPath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>
<arg>--resultOrgPath</arg><arg>${workingDir}/preparedInfo/resultOrgPath</arg>
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
</spark>
<ok to="apply_resulttoorganization_propagation"/>
<error to="Kill"/>
</action>
<action name="apply_resulttoorganization_propagation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>resultToOrganizationFromSemRel</name>
<class>eu.dnetlib.dhp.resulttoorganizationfromsemrel.SparkResultToOrganizationFromSemRel</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>
<arg>--resultOrgPath</arg><arg>${workingDir}/preparedInfo/resultOrgPath</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--workingDir</arg><arg>${workingDir}/working</arg>
<arg>--iterations</arg><arg>${iterations}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,9 @@
## This is a classpath-based import file (this header is required)
orcid_propagation classpath eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app
bulk_tagging classpath eu/dnetlib/dhp/wf/subworkflows/bulktag/oozie_app
affiliation_inst_repo classpath eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/oozie_app
affiliation_semantic_relation classpath eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfromsemrel/oozie_app
community_organization classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/oozie_app
result_project classpath eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app
community_sem_rel classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app
country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app

View File

@ -0,0 +1,312 @@
<workflow-app name="enrichment_main" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>allowedsemrelsorcidprop</name>
<description>the semantic relationships allowed for propagation</description>
</property>
<property>
<name>allowedsemrelsresultproject</name>
<description>the allowed semantics </description>
</property>
<property>
<name>allowedsemrelscommunitysemrel</name>
<description>the semantic relationships allowed for propagation</description>
</property>
<property>
<name>datasourceWhitelistForCountryPropagation</name>
<description>the white list</description>
</property>
<property>
<name>allowedtypes</name>
<description>the allowed types</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
<property>
<name>organizationtoresultcommunitymap</name>
<description>organization community map</description>
</property>
<property>
<name>isLookUpUrl</name>
<description>the isLookup service endpoint</description>
</property>
<property>
<name>pathMap</name>
<description>the json path associated to each selection field</description>
</property>
<property>
<name>blacklist</name>
<description>list of datasources in blacklist for the affiliation from instrepo propagation</description>
</property>
<property>
<name>hiveDbName</name>
<description>the target hive database name</description>
</property>
<property>
<name>hiveJdbcUrl</name>
<description>hive server jdbc url</description>
</property>
<property>
<name>hiveMetastoreUris</name>
<description>hive server metastore URIs</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="resumeFrom"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<decision name="resumeFrom">
<switch>
<case to="bulk_tagging">${wf:conf('resumeFrom') eq 'BulkTagging'}</case>
<case to="affiliation_inst_repo">${wf:conf('resumeFrom') eq 'AffiliationInstitutionalRepository'}</case>
<case to="affiliation_semantic_relation">${wf:conf('resumeFrom') eq 'AffiliationSemanticRelation'}</case>
<case to="community_organization">${wf:conf('resumeFrom') eq 'CommunityOrganization'}</case>
<case to="result_project">${wf:conf('resumeFrom') eq 'ResultProject'}</case>
<case to="community_sem_rel">${wf:conf('resumeFrom') eq 'CommunitySemanticRelation'}</case>
<case to="country_propagation">${wf:conf('resumeFrom') eq 'CountryPropagation'}</case>
<default to="orcid_propagation"/>
</switch>
</decision>
<action name="orcid_propagation">
<sub-workflow>
<app-path>${wf:appPath()}/orcid_propagation
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${sourcePath}</value>
</property>
<property>
<name>allowedsemrels</name>
<value>${allowedsemrelsorcidprop}</value>
</property>
<property>
<name>outputPath</name>
<value>${outputPath}</value>
</property>
</configuration>
</sub-workflow>
<ok to="bulk_tagging" />
<error to="Kill" />
</action>
<action name="bulk_tagging">
<sub-workflow>
<app-path>${wf:appPath()}/bulk_tagging
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>isLookUpUrl</name>
<value>${isLookUpUrl}</value>
</property>
<property>
<name>pathMap</name>
<value>${pathMap}</value>
</property>
</configuration>
</sub-workflow>
<ok to="affiliation_inst_repo" />
<error to="Kill" />
</action>
<action name="affiliation_inst_repo">
<sub-workflow>
<app-path>${wf:appPath()}/affiliation_inst_repo
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>outputPath</name>
<value>${workingDir}/relations</value>
</property>
<property>
<name>blacklist</name>
<value>${blacklist}</value>
</property>
</configuration>
</sub-workflow>
<ok to="affiliation_semantic_relation" />
<error to="Kill" />
</action>
<action name="affiliation_semantic_relation">
<sub-workflow>
<app-path>${wf:appPath()}/affiliation_semantic_relation
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
</configuration>
</sub-workflow>
<ok to="community_organization" />
<error to="Kill" />
</action>
<action name="community_organization">
<sub-workflow>
<app-path>${wf:appPath()}/community_organization
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>organizationtoresultcommunitymap</name>
<value>${organizationtoresultcommunitymap}</value>
</property>
</configuration>
</sub-workflow>
<ok to="result_project" />
<error to="Kill" />
</action>
<action name="result_project">
<sub-workflow>
<app-path>${wf:appPath()}/result_project
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>allowedsemrels</name>
<value>${allowedsemrelsresultproject}</value>
</property>
</configuration>
</sub-workflow>
<ok to="community_sem_rel" />
<error to="Kill" />
</action>
<action name="community_sem_rel">
<sub-workflow>
<app-path>${wf:appPath()}/community_sem_rel
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>allowedsemrels</name>
<value>${allowedsemrelscommunitysemrel}</value>
</property>
</configuration>
</sub-workflow>
<ok to="country_propagation" />
<error to="Kill" />
</action>
<action name="country_propagation">
<sub-workflow>
<app-path>${wf:appPath()}/country_propagation
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>sourcePath</name>
<value>${outputPath}</value>
</property>
<property>
<name>whitelist</name>
<value>${datasourceWhitelistForCountryPropagation}</value>
</property>
<property>
<name>allowedtypes</name>
<value>${allowedtupes}</value>
</property>
</configuration>
</sub-workflow>
<ok to="End" />
<error to="Kill" />
</action>
<end name="End"/>
</workflow-app>

View File

@ -44,65 +44,12 @@
</configuration>
</global>
<start to="reset_outputpath"/>
<start to="fork_exec_bulktag"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="fork_exec_bulktag"/>
<fork name="fork_exec_bulktag">
<path start="join_bulktag_publication"/>
@ -130,7 +77,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/bulktag/publication</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
@ -157,7 +104,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/bulktag/dataset</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
@ -184,7 +131,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/bulktag/otherresearchproduct</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
@ -211,7 +158,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--outputPath</arg><arg>${workingDir}/bulktag/software</arg>
<arg>--pathMap</arg><arg>${pathMap}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
@ -239,7 +186,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscTag</arg>
<arg>--workingPath</arg><arg>${workingDir}/bulktag</arg>
</spark>
<ok to="eosc_get_datasource_master"/>
<error to="Kill"/>
@ -283,7 +230,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/publication</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/publication</arg>
<arg>--workingPath</arg><arg>${workingDir}/bulktag/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>
@ -309,7 +256,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/dataset</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/dataset</arg>
<arg>--workingPath</arg><arg>${workingDir}/bulktag/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>
@ -334,7 +281,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/software</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/software</arg>
<arg>--workingPath</arg><arg>${workingDir}/bulktag/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>
@ -359,14 +306,24 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--workingPath</arg><arg>${workingDir}/eoscContextTag/otherresearchproduct</arg>
<arg>--workingPath</arg><arg>${workingDir}/bulktag/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--datasourceMapPath</arg><arg>${workingDir}/datasourcemaster</arg>
</spark>
<ok to="wait_eosc_context_tag"/>
<error to="Kill"/>
</action>
<join name="wait_eosc_context_tag" to="End"/>
<join name="wait_eosc_context_tag" to="reset_workingDir"/>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -12,11 +12,6 @@
<name>allowedtypes</name>
<description>the allowed types</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
</parameters>
<global>
@ -30,65 +25,13 @@
</configuration>
</global>
<start to="reset_outputpath"/>
<start to="prepare_datasource_country_association"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="prepare_datasource_country_association"/>
<action name="prepare_datasource_country_association">
<spark xmlns="uri:oozie:spark-action:0.2">
@ -110,20 +53,20 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--whitelist</arg><arg>${whitelist}</arg>
<arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
</spark>
<ok to="fork_join_prepare_result_country"/>
<ok to="fork_prepare_result_country"/>
<error to="Kill"/>
</action>
<fork name="fork_join_prepare_result_country">
<path start="join_prepareresult_publication"/>
<path start="join_prepareresult_dataset"/>
<path start="join_prepareresult_otherresearchproduct"/>
<path start="join_prepareresult_software"/>
<fork name="fork_prepare_result_country">
<path start="prepareresult_publication"/>
<path start="prepareresult_dataset"/>
<path start="prepareresult_otherresearchproduct"/>
<path start="prepareresult_software"/>
</fork>
<action name="join_prepareresult_publication">
<action name="prepareresult_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -144,16 +87,14 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingP</arg>
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="wait_prepare"/>
<error to="Kill"/>
</action>
<action name="join_prepareresult_dataset">
<action name="prepareresult_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -174,16 +115,14 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingD</arg>
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="wait_prepare"/>
<error to="Kill"/>
</action>
<action name="join_prepareresult_otherresearchproduct">
<action name="prepareresult_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -204,16 +143,14 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingO</arg>
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="wait_prepare"/>
<error to="Kill"/>
</action>
<action name="join_prepareresult_software">
<action name="prepareresult_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -234,10 +171,8 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--outputPath</arg><arg>${workingDir}/software</arg>
<arg>--workingPath</arg><arg>${workingDir}/workingS</arg>
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
</spark>
<ok to="wait_prepare"/>
<error to="Kill"/>
@ -245,14 +180,14 @@
<join name="wait_prepare" to="fork_join_apply_country_propagation"/>
<fork name="fork_join_apply_country_propagation">
<path start="join_propagation_publication"/>
<path start="join_propagation_dataset"/>
<path start="join_propagation_otherresearchproduct"/>
<path start="join_propagation_software"/>
<fork name="fork_apply_country_propagation">
<path start="propagation_publication"/>
<path start="propagation_dataset"/>
<path start="propagation_otherresearchproduct"/>
<path start="propagation_software"/>
</fork>
<action name="join_propagation_publication">
<action name="propagation_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -273,15 +208,15 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/publication</arg>
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="join_propagation_dataset">
<action name="propagation_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -302,15 +237,15 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="join_propagation_otherresearchproduct">
<action name="propagation_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -331,15 +266,15 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="join_propagation_software">
<action name="propagation_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -360,16 +295,22 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/software</arg>
<arg>--workingPath</arg><arg>${workingDir}/country</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
</action>
<join name="wait" to="End"/>
<join name="wait" to="reset_workingDir"/>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
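For local debugging, the same Spark entry point the propagation actions invoke can be driven directly; a minimal sketch mirroring the action's argument list above (the paths are placeholders, and the call pattern matches the repository's own tests further down):

    // Hypothetical local run of the country-propagation job; paths are placeholders.
    SparkCountryPropagationJob
        .main(
            new String[] {
                "--isSparkSessionManaged", Boolean.FALSE.toString(),
                "--sourcePath", "/tmp/source/publication",
                "--resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
                "--workingPath", "/tmp/workingDir/country"
            });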

View File

@@ -80,7 +80,37 @@
<error to="Kill"/>
</action>
<join name="copy_wait" to="fork_prepare_assoc_step1"/>
<join name="copy_wait" to="prepare_relations"/>
<action name="prepare_relations">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ORCIDPropagation-PreparePhase0-SelectRELATIONS</name>
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.PrepareResultOrcidAssociationStep0</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.sql.shuffle.partitions=3840
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/orcidprop/relationSubset</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
</spark>
<ok to="fork_prepare_assoc_step1"/>
<error to="Kill"/>
</action>
<fork name="fork_prepare_assoc_step1">
<path start="join_prepare_publication"/>
@@ -112,10 +142,10 @@
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
<arg>--workingPath</arg><arg>${workingDir}/orcidprop</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--allowedpids</arg><arg>${allowedpids}</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
@@ -140,10 +170,10 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
<arg>--workingPath</arg><arg>${workingDir}/orcidprop</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--allowedpids</arg><arg>${allowedpids}</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
@@ -168,10 +198,10 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
<arg>--workingPath</arg><arg>${workingDir}/orcidprop</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--allowedpids</arg><arg>${allowedpids}</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
@@ -196,10 +226,10 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
<arg>--workingPath</arg><arg>${workingDir}/orcidprop</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--allowedpids</arg><arg>${allowedpids}</arg>
</spark>
<ok to="wait"/>
<error to="Kill"/>
@@ -225,8 +255,8 @@
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/preparedInfo/targetOrcidAssoc</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${workingDir}/orcidprop</arg>
<arg>--outputPath</arg><arg>${workingDir}/orcidprop/mergedOrcidAssoc</arg>
</spark>
<ok to="fork-join-exec-propagation"/>
<error to="Kill"/>
@@ -261,9 +291,8 @@
--conf spark.hadoop.mapreduce.reduce.speculative=false
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
</spark>
@@ -292,9 +321,8 @@
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
</spark>
@@ -323,9 +351,8 @@
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
</spark>
@@ -354,9 +381,8 @@
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/preparedInfo/mergedOrcidAssoc</arg>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcidprop/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
</spark>
@@ -364,7 +390,16 @@
<error to="Kill"/>
</action>
<join name="wait2" to="End"/>
<join name="wait2" to="reset_workingDir"/>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
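The PreparePhase0 action above reduces the relation set to the allowed semantics before any per-type preparation starts. A method-level sketch of that selection, assuming a Dataset of oaf Relation beans and a semicolon-separated allowedsemrels value (illustrative, not the class's actual code):

    import java.util.Arrays;
    import java.util.List;
    import org.apache.spark.api.java.function.FilterFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.SaveMode;
    import eu.dnetlib.dhp.schema.oaf.Relation;

    // Sketch: keep only relations whose relation class is in the allowed list.
    void selectRelationSubset(Dataset<Relation> relation, String allowedsemrels, String workingDir) {
        List<String> allowed = Arrays.asList(allowedsemrels.toLowerCase().split(";"));
        relation
            .filter((FilterFunction<Relation>) r -> allowed.contains(r.getRelClass().toLowerCase()))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(workingDir + "/orcidprop/relationSubset");
    }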

View File

@@ -0,0 +1,94 @@
<workflow-app name="project_to_result_propagation" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>allowedsemrels</name>
<description>the allowed semantic relations</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="prepare_project_results_association"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="prepare_project_results_association">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareProjectResultsAssociation</name>
<class>eu.dnetlib.dhp.projecttoresult.PrepareProjectResultsAssociation</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--potentialUpdatePath</arg><arg>${workingDir}/resultproject/preparedInfo/potentialUpdates</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/resultproject/preparedInfo/alreadyLinked</arg>
</spark>
<ok to="apply_propagation"/>
<error to="Kill"/>
</action>
<action name="apply_propagation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ProjectToResultPropagation</name>
<class>eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--outputPath</arg><arg>${sourcePath}/relation</arg>
<arg>--potentialUpdatePath</arg><arg>${workingDir}/resultproject/preparedInfo/potentialUpdates</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/resultproject/preparedInfo/alreadyLinked</arg>
</spark>
<ok to="reset_workingDir"/>
<error to="Kill"/>
</action>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
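The two prepared paths feed a set difference: candidate result-project links minus the links already in the graph. A conceptual sketch over untyped rows (spark, workingDir and the resultId/projectId column names are assumed context, not the job's actual code):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    // Conceptual: emit only candidate links that are not already present.
    Dataset<Row> candidates = spark.read().json(workingDir + "/resultproject/preparedInfo/potentialUpdates");
    Dataset<Row> linked = spark.read().json(workingDir + "/resultproject/preparedInfo/alreadyLinked");
    Dataset<Row> newLinks = candidates
        .join(
            linked,
            candidates.col("resultId").equalTo(linked.col("resultId"))
                .and(candidates.col("projectId").equalTo(linked.col("projectId"))),
            "left_anti");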

View File

@@ -8,10 +8,7 @@
<name>organizationtoresultcommunitymap</name>
<description>organization community map</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
</parameters>
<global>
@@ -25,66 +22,12 @@
</configuration>
</global>
<start to="reset_outputpath"/>
<start to="prepare_result_communitylist"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="prepare_result_communitylist"/>
<action name="prepare_result_communitylist">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@@ -104,7 +47,7 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
<arg>--outputPath</arg><arg>${workingDir}/communityorganization/preparedInfo/resultCommunityList</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--organizationtoresultcommunitymap</arg><arg>${organizationtoresultcommunitymap}</arg>
</spark>
@@ -137,12 +80,12 @@
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/communityorganization/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/communityorganization/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark>
<ok to="wait2"/>
<error to="Kill"/>
@@ -166,12 +109,12 @@
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/communityorganization/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/communityorganization/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark>
<ok to="wait2"/>
<error to="Kill"/>
@@ -195,12 +138,12 @@
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/communityorganization/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/communityorganization/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark>
<ok to="wait2"/>
<error to="Kill"/>
@@ -224,19 +167,27 @@
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/communityorganization/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--outputPath</arg><arg>${workingDir}/communityorganization/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark>
<ok to="wait2"/>
<error to="Kill"/>
</action>
<join name="wait2" to="End"/>
<join name="wait2" to="reset_workingDir"/>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
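The organizationtoresultcommunitymap parameter carries the mapping the prepare step joins against the relations; a plausible shape is a JSON object from organization identifiers to community lists (the identifiers below are invented, shown as a Java string for uniformity):

    // Assumed shape of the organization-to-community map; the ids are invented.
    String organizationtoresultcommunitymap =
        "{\"20|corda__h2020::0000000000000000000000000000000000000000\": [\"ee\", \"dh-ch\"]}";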

View File

@@ -18,73 +18,13 @@
</property>
</parameters>
<start to="reset_outputpath"/>
<start to="fork_prepare_assoc_step1"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="fork_prepare_assoc_step1"/>
<fork name="fork_prepare_assoc_step1">
<path start="join_prepare_publication"/>
@@ -114,7 +54,7 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetCommunityAssoc</arg>
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
@@ -143,7 +83,7 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetCommunityAssoc</arg>
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
@@ -172,7 +112,7 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetCommunityAssoc</arg>
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
@@ -201,7 +141,7 @@
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/targetCommunityAssoc</arg>
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc</arg>
<arg>--allowedsemrels</arg><arg>${allowedsemrels}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
</spark>
@@ -229,8 +169,8 @@
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/preparedInfo/targetCommunityAssoc</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg>
<arg>--sourcePath</arg><arg>${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc</arg>
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc</arg>
</spark>
<ok to="fork-join-exec-propagation"/>
<error to="Kill"/>
@@ -261,12 +201,12 @@
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/publication</arg>
</spark>
<ok to="wait2"/>
<error to="Kill"/>
@@ -290,12 +230,12 @@
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/dataset</arg>
</spark>
<ok to="wait2"/>
<error to="Kill"/>
@@ -319,12 +259,12 @@
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/otherresearchproduct</arg>
</spark>
<ok to="wait2"/>
<error to="Kill"/>
@@ -348,19 +288,26 @@
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/mergedCommunityAssoc</arg>
<arg>--preparedInfoPath</arg><arg>${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--outputPath</arg><arg>${workingDir}/communitysemrel/software</arg>
</spark>
<ok to="wait2"/>
<error to="Kill"/>
</action>
<join name="wait2" to="End"/>
<join name="wait2" to="reset_workingDir"/>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@@ -21,105 +21,12 @@
</configuration>
</global>
<start to="reset_outputpath"/>
<start to="prepare_result_organization_association"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</fs>
<ok to="copy_entities"/>
<error to="Kill"/>
</action>
<fork name="copy_entities">
<path start="copy_relation"/>
<path start="copy_publication"/>
<path start="copy_dataset"/>
<path start="copy_orp"/>
<path start="copy_software"/>
<path start="copy_organization"/>
<path start="copy_projects"/>
<path start="copy_datasources"/>
</fork>
<action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/publication</arg>
<arg>${nameNode}/${outputPath}/publication</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/dataset</arg>
<arg>${nameNode}/${outputPath}/dataset</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_orp">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_software">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/software</arg>
<arg>${nameNode}/${outputPath}/software</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg>
</distcp>
<ok to="wait"/>
<error to="Kill"/>
</action>
<join name="wait" to="prepare_result_organization_association"/>
<action name="prepare_result_organization_association">
<spark xmlns="uri:oozie:spark-action:0.2">
@@ -138,9 +45,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<arg>--workingPath</arg><arg>${workingDir}/affiliationInstRepo</arg>
<arg>--blacklist</arg><arg>${blacklist}</arg>
</spark>
<ok to="fork_join_apply_resulttoorganization_propagation"/>
@@ -174,8 +79,8 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
</spark>
@@ -203,8 +108,8 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
</spark>
@@ -232,8 +137,8 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
</spark>
@@ -261,8 +166,8 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
</spark>

View File

@@ -0,0 +1,97 @@
<workflow-app name="affiliation_from_semrel_propagation" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="prepare_info"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="prepare_info">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareResultOrganizationAssociation</name>
<class>eu.dnetlib.dhp.resulttoorganizationfromsemrel.PrepareInfo</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--graphPath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--leavesPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/leavesPath</arg>
<arg>--childParentPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/childParentPath</arg>
<arg>--resultOrgPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/resultOrgPath</arg>
<arg>--relationPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/relation</arg>
</spark>
<ok to="apply_resulttoorganization_propagation"/>
<error to="Kill"/>
</action>
<action name="apply_resulttoorganization_propagation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>resultToOrganizationFromSemRel</name>
<class>eu.dnetlib.dhp.resulttoorganizationfromsemrel.SparkResultToOrganizationFromSemRel</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--relationPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/relation</arg>
<arg>--outputPath</arg><arg>${sourcePath}</arg>
<arg>--leavesPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/leavesPath</arg>
<arg>--childParentPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/childParentPath</arg>
<arg>--resultOrgPath</arg><arg>${workingDir}/affiliationSemanticRelation/preparedInfo/resultOrgPath</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--workingDir</arg><arg>${workingDir}/affiliationSemanticRelation/working</arg>
<arg>--iterations</arg><arg>${iterations}</arg>
</spark>
<ok to="reset_workingDir"/>
<error to="Kill"/>
</action>
<action name="reset_workingDir">
<fs>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>
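The leaves/childParent/resultOrg paths prepared above support an iterative climb of the organization hierarchy, bounded by the iterations parameter. A schematic loop over untyped pair datasets (spark, prepared and the column names are invented for illustration; the real job's bookkeeping is more involved):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    // Schematic: lift result->organization links one hierarchy level per iteration.
    Dataset<Row> resultOrg = spark.read().json(prepared + "/resultOrgPath").select("resultId", "orgId");
    Dataset<Row> childParent = spark.read().json(prepared + "/childParentPath");
    for (int i = 0; i < iterations; i++) {
        Dataset<Row> lifted = resultOrg
            .join(childParent, resultOrg.col("orgId").equalTo(childParent.col("childId")))
            .select(resultOrg.col("resultId"), childParent.col("parentId").alias("orgId"));
        resultOrg = resultOrg.union(lifted).distinct();
    }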

View File

@@ -27,16 +27,11 @@ import org.slf4j.LoggerFactory;
*/
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.bulktag.eosc.DatasourceMaster;
import eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.*;
//"50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea" has instance hostedby eosc
//"50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1" has instance hostedby eosc
//"50|475c1990cbb2::449f28eefccf9f70c04ad70d61e041c7" has two instance one hostedby eosc
//"50|475c1990cbb2::3894c94123e96df8a21249957cf160cb" has EoscTag
public class EOSCContextTaggingTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -78,6 +73,22 @@ public class EOSCContextTaggingTest {
@Test
void EoscContextTagTest() throws Exception {
//"50|475c1990cbb2::0fecfb874d9395aa69d2f4d7cd1acbea" has instance hostedby eosc (cris)
//"50|475c1990cbb2::3185cd5d8a2b0a06bb9b23ef11748eb1" has instance hostedby eosc (zenodo)
//"50|475c1990cbb2::449f28eefccf9f70c04ad70d61e041c7" has two instance one hostedby eosc (wrong compatibility)
//"50|475c1990cbb2::3894c94123e96df8a21249957cf160cb" has EoscTag
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/datasource/datasource_1").getPath())
.map(
(MapFunction<String, Datasource>) value -> OBJECT_MAPPER.readValue(value, Datasource.class),
Encoders.bean(Datasource.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/datasource");
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/dataset/dataset_10.json").getPath())
@@ -94,17 +105,24 @@
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath",
workingDir.toString() + "/input/dataset",
"-workingPath", workingDir.toString() + "/working/dataset",
workingDir.toString() + "/input/",
"-workingPath", workingDir.toString() + "/working/",
"-datasourceMapPath",
getClass()
.getResource("/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster")
.getPath(),
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset"
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-resultType", "dataset"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
Assertions
.assertEquals(
2, sc
.textFile(workingDir.toString() + "/working/datasource")
.map(item -> OBJECT_MAPPER.readValue(item, DatasourceMaster.class))
.count());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
@@ -113,7 +131,7 @@ public class EOSCContextTaggingTest {
Assertions
.assertEquals(
4,
2,
tmp
.filter(
s -> s.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
@@ -140,17 +158,17 @@
Assertions
.assertEquals(
1,
0,
tmp
.filter(
d -> d.getId().equals("50|475c1990cbb2::3894c94123e96df8a21249957cf160cb")
d -> d.getId().equals("50|475c1990cbb2::449f28eefccf9f70c04ad70d61e041c7")
&&
d.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
.count());
Assertions
.assertEquals(
1,
0,
tmp
.filter(
d -> d.getId().equals("50|475c1990cbb2::3894c94123e96df8a21249957cf160cb")
@@ -159,4 +177,62 @@
.count());
}
@Test
void EoscContextTagTestEmptyDatasource() throws Exception {
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/dataset/dataset_10.json").getPath())
.map(
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
Encoders.bean(Dataset.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/dataset");
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/bulktag/eosc/datasource/datasource").getPath())
.map(
(MapFunction<String, Datasource>) value -> OBJECT_MAPPER.readValue(value, Datasource.class),
Encoders.bean(Datasource.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/datasource");
SparkEoscBulkTag
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath",
workingDir.toString() + "/input/",
"-workingPath", workingDir.toString() + "/working/",
"-datasourceMapPath",
getClass()
.getResource("/eu/dnetlib/dhp/bulktag/eosc/datasourceMasterAssociation/datasourceMaster")
.getPath(),
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset",
"-resultType", "dataset"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Dataset> tmp = sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
Assertions.assertEquals(10, tmp.count());
Assertions
.assertEquals(
0,
tmp
.filter(
s -> s.getContext().stream().anyMatch(c -> c.getId().equals("eosc")))
.count());
}
}
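The fixtures above exercise a simple rule: a result gains the eosc context only when at least one of its instances is hosted by a datasource from the EOSC master list. A rough sketch of that predicate (eoscDatasourceIds is an assumed set of datasource ids; this is not the tagger's actual code):

    // Sketch: tag when any instance is hosted by a known EOSC datasource.
    boolean shouldTag = result
        .getInstance()
        .stream()
        .anyMatch(i -> i.getHostedby() != null && eoscDatasourceIds.contains(i.getHostedby().getKey()));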

View File

@@ -0,0 +1,634 @@
package eu.dnetlib.dhp.countrypropagation;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.countrypropagation.pojo.DatasourceCountry;
import eu.dnetlib.dhp.countrypropagation.pojo.ResultCountrySet;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Software;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import scala.Tuple2;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
/**
* @author miriam.baglioni
* @Date 23/11/22
*/
public class CountryPropagationAllStepsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(DatasourceCountryPreparationTest.class.getSimpleName());
SparkConf conf = new SparkConf();
conf.setAppName(DatasourceCountryPreparationTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DatasourceCountryPreparationTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void allStepsTest() throws Exception {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph")
.getPath();
PrepareDatasourceCountryAssociation
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"--workingPath", workingDir.toString() + "/country",
"--allowedtypes", "pubsrepository::institutional",
"--whitelist",
"10|openaire____::3795d6478e30e2c9f787d427ff160944;10|opendoar____::16e6a3326dd7d868cbc926602a61e4d0;10|eurocrisdris::fe4903425d9040f680d8610d9079ea14;10|openaire____::5b76240cc27a58c6f7ceef7d8c36660e;10|openaire____::172bbccecf8fca44ab6a6653e84cb92a;10|openaire____::149c6590f8a06b46314eed77bfca693f;10|eurocrisdris::a6026877c1a174d60f81fd71f62df1c1;10|openaire____::4692342f0992d91f9e705c26959f09e0;10|openaire____::8d529dbb05ec0284662b391789e8ae2a;10|openaire____::345c9d171ef3c5d706d08041d506428c;10|opendoar____::1c1d4df596d01da60385f0bb17a4a9e0;10|opendoar____::7a614fd06c325499f1680b9896beedeb;10|opendoar____::1ee3dfcd8a0645a25a35977997223d22;10|opendoar____::d296c101daa88a51f6ca8cfc1ac79b50;10|opendoar____::798ed7d4ee7138d49b8828958048130a;10|openaire____::c9d2209ecc4d45ba7b4ca7597acb88a2;10|eurocrisdris::c49e0fe4b9ba7b7fab717d1f0f0a674d;10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539;10|eurocrisdris::432ca599953ff50cd4eeffe22faf3e48"
});
sc.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/publication")
.getPath()).saveAsTextFile(workingDir.toString() + "/source/publication");
sc
.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/software")
.getPath()).saveAsTextFile(workingDir.toString() + "/source/software");
verifyDatasourceCountry();
PrepareResultCountrySet
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--workingPath", workingDir.toString() + "/country",
"--sourcePath", workingDir.toString() + "/source/publication",
"--resultTableName", Publication.class.getCanonicalName()
});
verifyResultCountrySet();
PrepareResultCountrySet
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--workingPath", workingDir.toString() + "/country",
"--sourcePath", workingDir.toString() + "/source/software",
"--resultTableName", Software.class.getCanonicalName()
});
SparkCountryPropagationJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath",workingDir.toString() + "/source/publication",
"-resultTableName", Publication.class.getCanonicalName(),
"-workingPath", workingDir.toString() +"/country"
});
verifyPropagationPublication();
SparkCountryPropagationJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath",workingDir.toString() + "/source/software",
"-resultTableName", Software.class.getCanonicalName(),
"-workingPath", workingDir.toString() + "/country"
});
verifyPropagationSoftware();
}
void verifyDatasourceCountry(){
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<DatasourceCountry> tmp = sc
.textFile(workingDir.toString() + "/country/datasourceCountry")
.map(item -> OBJECT_MAPPER.readValue(item, DatasourceCountry.class));
Assertions.assertEquals(3, tmp.count());
Assertions
.assertEquals(
1, tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::fe4903425d9040f680d8610d9079ea14"))
.count());
Assertions
.assertEquals(
1, tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|opendoar____::f0dd4a99fba6075a9494772b58f95280"))
.count());
Assertions
.assertEquals(
1, tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539"))
.count());
Assertions
.assertEquals(
"NL", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::fe4903425d9040f680d8610d9079ea14"))
.collect()
.get(0)
.getCountry()
.getClassid());
Assertions
.assertEquals(
"Netherlands", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::fe4903425d9040f680d8610d9079ea14"))
.collect()
.get(0)
.getCountry()
.getClassname());
Assertions
.assertEquals(
"IT", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|opendoar____::f0dd4a99fba6075a9494772b58f95280"))
.collect()
.get(0)
.getCountry()
.getClassid());
Assertions
.assertEquals(
"Italy", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|opendoar____::f0dd4a99fba6075a9494772b58f95280"))
.collect()
.get(0)
.getCountry()
.getClassname());
Assertions
.assertEquals(
"FR", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539"))
.collect()
.get(0)
.getCountry()
.getClassid());
Assertions
.assertEquals(
"France", tmp
.filter(
dsc -> dsc
.getDataSourceId()
.equals("10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539"))
.collect()
.get(0)
.getCountry()
.getClassname());
tmp.foreach(e -> System.out.println(OBJECT_MAPPER.writeValueAsString(e)));
}
void verifyResultCountrySet(){
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultCountrySet> tmp = sc
.textFile(workingDir.toString() + "/country/preparedInfo/publication")
.map(item -> OBJECT_MAPPER.readValue(item, ResultCountrySet.class));
Assertions.assertEquals(5, tmp.count());
ResultCountrySet rc = tmp
.filter(r -> r.getResultId().equals("50|06cdd3ff4700::49ec404cee4e1452808aabeaffbd3072"))
.collect()
.get(0);
Assertions.assertEquals(1, rc.getCountrySet().size());
Assertions.assertEquals("NL", rc.getCountrySet().get(0).getClassid());
Assertions.assertEquals("Netherlands", rc.getCountrySet().get(0).getClassname());
rc = tmp
.filter(r -> r.getResultId().equals("50|07b5c0ccd4fe::e7f5459cc97865f2af6e3da964c1250b"))
.collect()
.get(0);
Assertions.assertEquals(1, rc.getCountrySet().size());
Assertions.assertEquals("NL", rc.getCountrySet().get(0).getClassid());
Assertions.assertEquals("Netherlands", rc.getCountrySet().get(0).getClassname());
rc = tmp
.filter(r -> r.getResultId().equals("50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6"))
.collect()
.get(0);
Assertions.assertEquals(2, rc.getCountrySet().size());
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("FR") && cs.getClassname().equals("France")));
rc = tmp
.filter(r -> r.getResultId().equals("50|355e65625b88::74009c567c81b4aa55c813db658734df"))
.collect()
.get(0);
Assertions.assertEquals(2, rc.getCountrySet().size());
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("NL") && cs.getClassname().equals("Netherlands")));
rc = tmp
.filter(r -> r.getResultId().equals("50|355e65625b88::54a1c76f520bb2c8da27d12e42891088"))
.collect()
.get(0);
Assertions.assertEquals(2, rc.getCountrySet().size());
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
rc
.getCountrySet()
.stream()
.anyMatch(cs -> cs.getClassid().equals("FR") && cs.getClassname().equals("France")));
}
void verifyPropagationPublication(){
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.textFile(workingDir.toString() + "/country/publication")
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));
Assertions.assertEquals(12, tmp.count());
Assertions.assertEquals(5, tmp.filter(r -> r.getCountry().size() > 0).count());
tmp
.foreach(
r -> r.getCountry().stream().forEach(c -> Assertions.assertEquals("dnet:countries", c.getSchemeid())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(c -> Assertions.assertEquals("dnet:countries", c.getSchemename())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(c -> Assertions.assertFalse(c.getDataInfo().getDeletedbyinference())));
tmp.foreach(r -> r.getCountry().stream().forEach(c -> Assertions.assertFalse(c.getDataInfo().getInvisible())));
tmp.foreach(r -> r.getCountry().stream().forEach(c -> Assertions.assertTrue(c.getDataInfo().getInferred())));
tmp
.foreach(
r -> r.getCountry().stream().forEach(c -> Assertions.assertEquals("0.85", c.getDataInfo().getTrust())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(c -> Assertions.assertEquals("propagation", c.getDataInfo().getInferenceprovenance())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(
c -> Assertions
.assertEquals("country:instrepos", c.getDataInfo().getProvenanceaction().getClassid())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(
c -> Assertions
.assertEquals(
"dnet:provenanceActions", c.getDataInfo().getProvenanceaction().getSchemeid())));
tmp
.foreach(
r -> r
.getCountry()
.stream()
.forEach(
c -> Assertions
.assertEquals(
"dnet:provenanceActions", c.getDataInfo().getProvenanceaction().getSchemename())));
List<Country> countries = tmp
.filter(r -> r.getId().equals("50|06cdd3ff4700::49ec404cee4e1452808aabeaffbd3072"))
.collect()
.get(0)
.getCountry();
Assertions.assertEquals(1, countries.size());
Assertions.assertEquals("NL", countries.get(0).getClassid());
Assertions.assertEquals("Netherlands", countries.get(0).getClassname());
countries = tmp
.filter(r -> r.getId().equals("50|07b5c0ccd4fe::e7f5459cc97865f2af6e3da964c1250b"))
.collect()
.get(0)
.getCountry();
Assertions.assertEquals(1, countries.size());
Assertions.assertEquals("NL", countries.get(0).getClassid());
Assertions.assertEquals("Netherlands", countries.get(0).getClassname());
countries = tmp
.filter(r -> r.getId().equals("50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6"))
.collect()
.get(0)
.getCountry();
Assertions.assertEquals(2, countries.size());
Assertions
.assertTrue(
countries.stream().anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
countries.stream().anyMatch(cs -> cs.getClassid().equals("FR") && cs.getClassname().equals("France")));
countries = tmp
.filter(r -> r.getId().equals("50|355e65625b88::74009c567c81b4aa55c813db658734df"))
.collect()
.get(0)
.getCountry();
Assertions.assertEquals(2, countries.size());
Assertions
.assertTrue(
countries.stream().anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
countries
.stream()
.anyMatch(cs -> cs.getClassid().equals("NL") && cs.getClassname().equals("Netherlands")));
countries = tmp
.filter(r -> r.getId().equals("50|355e65625b88::54a1c76f520bb2c8da27d12e42891088"))
.collect()
.get(0)
.getCountry();
Assertions.assertEquals(2, countries.size());
Assertions
.assertTrue(
countries.stream().anyMatch(cs -> cs.getClassid().equals("IT") && cs.getClassname().equals("Italy")));
Assertions
.assertTrue(
countries.stream().anyMatch(cs -> cs.getClassid().equals("FR") && cs.getClassname().equals("France")));
}
void verifyPropagationSoftware(){
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Software> tmp = sc
.textFile(workingDir.toString() + "/source/software")
.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
Assertions.assertEquals(10, tmp.count());
Dataset<Software> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Software.class));
Assertions.assertEquals(6, verificationDs.filter("size(country) > 0").count());
Assertions.assertEquals(3, verificationDs.filter("size(country) = 1").count());
Assertions.assertEquals(3, verificationDs.filter("size(country) = 2").count());
Assertions.assertEquals(0, verificationDs.filter("size(country) > 2").count());
Dataset<String> countryExploded = verificationDs
.flatMap(
(FlatMapFunction<Software, Country>) row -> row.getCountry().iterator(), Encoders.bean(Country.class))
.map((MapFunction<Country, String>) Qualifier::getClassid, Encoders.STRING());
Assertions.assertEquals(9, countryExploded.count());
Assertions.assertEquals(1, countryExploded.filter("value = 'FR'").count());
Assertions.assertEquals(1, countryExploded.filter("value = 'TR'").count());
Assertions.assertEquals(2, countryExploded.filter("value = 'IT'").count());
Assertions.assertEquals(1, countryExploded.filter("value = 'US'").count());
Assertions.assertEquals(1, countryExploded.filter("value = 'MX'").count());
Assertions.assertEquals(1, countryExploded.filter("value = 'CH'").count());
Assertions.assertEquals(2, countryExploded.filter("value = 'JP'").count());
Dataset<Tuple2<String, String>> countryExplodedWithCountryclassid = verificationDs
.flatMap((FlatMapFunction<Software, Tuple2<String, String>>) row -> {
List<Tuple2<String, String>> prova = new ArrayList<>();
List<Country> countryList = row.getCountry();
countryList
.forEach(
c -> prova
.add(
new Tuple2<>(
row.getId(), c.getClassid())));
return prova.iterator();
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
Assertions.assertEquals(9, countryExplodedWithCountryclassid.count());
//countryExplodedWithCountryclassid.show(false);
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'FR' ")
.count());
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'TR' ")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'IT' or _2 = 'MX') ")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'IT' or _2 = 'US') ")
.count());
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'JP'")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassid
.filter(
"_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'CH' or _2 = 'JP') ")
.count());
Dataset<Tuple2<String, String>> countryExplodedWithCountryclassname = verificationDs
.flatMap(
(FlatMapFunction<Software, Tuple2<String, String>>) row -> {
List<Tuple2<String, String>> prova = new ArrayList<>();
List<Country> countryList = row.getCountry();
countryList
.forEach(
c -> prova
.add(
new Tuple2<>(
row.getId(),
c.getClassname())));
return prova.iterator();
},
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
//countryExplodedWithCountryclassname.show(false);
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'France' ")
.count());
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'Turkey' ")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'Italy' or _2 = 'Mexico') ")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'Italy' or _2 = 'United States') ")
.count());
Assertions
.assertEquals(
1,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'Japan' ")
.count());
Assertions
.assertEquals(
2,
countryExplodedWithCountryclassname
.filter(
"_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'Switzerland' or _2 = 'Japan') ")
.count());
Dataset<Tuple2<String, String>> countryExplodedWithCountryProvenance = verificationDs
.flatMap(
(FlatMapFunction<Software, Tuple2<String, String>>) row -> {
List<Tuple2<String, String>> prova = new ArrayList<>();
List<Country> countryList = row.getCountry();
countryList
.forEach(
c -> prova
.add(
new Tuple2<>(
row.getId(),
c
.getDataInfo()
.getInferenceprovenance())));
return prova.iterator();
},
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
Assertions
.assertEquals(
7, countryExplodedWithCountryProvenance.filter("_2 = 'propagation'").count());
}
}
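
The three flatMap blocks above differ only in the Country field they project. A minimal sketch of a shared helper, assuming the Software and Country beans used in this test; CountryField is a hypothetical serializable accessor, not part of the patch:

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import scala.Tuple2;

// Hypothetical accessor; Serializable so Spark can ship the closure to executors.
interface CountryField extends Serializable {
    String get(Country c);
}

// Explodes each result into (resultId, <country field>) pairs, as done inline above.
static Dataset<Tuple2<String, String>> explodeCountry(Dataset<Software> ds, CountryField field) {
    return ds.flatMap(
        (FlatMapFunction<Software, Tuple2<String, String>>) row -> {
            List<Tuple2<String, String>> tuples = new ArrayList<>();
            row.getCountry().forEach(c -> tuples.add(new Tuple2<>(row.getId(), field.get(c))));
            return tuples.iterator();
        },
        Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
}

With this helper the three datasets reduce to explodeCountry(verificationDs, Country::getClassid), explodeCountry(verificationDs, Country::getClassname) and explodeCountry(verificationDs, c -> c.getDataInfo().getInferenceprovenance()).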


@@ -7,6 +7,7 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.antlr.v4.runtime.misc.Utils;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@@ -69,26 +70,35 @@ public class CountryPropagationJobTest {
@Test
void testCountryPropagationSoftware() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/software")
.getPath();
final String preparedInfoPath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo/software")
.getPath();
SparkCountryPropagationJob
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
sc
.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/software")
.getPath()).saveAsTextFile(workingDir.toString() + "/source/software");
sc
.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo/software")
.getPath()).saveAsTextFile(workingDir.toString() + "/preparedInfo/software");
SparkCountryPropagationJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"--sourcePath",workingDir.toString() + "/source/software",
"-resultTableName", Software.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/software",
"-preparedInfoPath", preparedInfoPath
"-workingPath", workingDir.toString(),
"-resultType", "software"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Software> tmp = sc
.textFile(workingDir.toString() + "/software")
.textFile(workingDir.toString() + "/source/software")
.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
Assertions.assertEquals(10, tmp.count());
@@ -130,7 +140,7 @@ public class CountryPropagationJobTest {
Assertions.assertEquals(9, countryExplodedWithCountryclassid.count());
countryExplodedWithCountryclassid.show(false);
//countryExplodedWithCountryclassid.show(false);
Assertions
.assertEquals(
1,
@@ -190,7 +200,7 @@ public class CountryPropagationJobTest {
},
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
countryExplodedWithCountryclassname.show(false);
//countryExplodedWithCountryclassname.show(false);
Assertions
.assertEquals(
1,
@@ -259,23 +269,31 @@ public class CountryPropagationJobTest {
@Test
void testCountryPropagationPublication() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/publication")
.getPath();
final String preparedInfoPath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo/publication")
.getPath();
SparkCountryPropagationJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"-resultTableName", Publication.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/publication",
"-preparedInfoPath", preparedInfoPath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
sc
.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/publication")
.getPath()).saveAsTextFile(workingDir.toString() + "/source/publication");
sc
.textFile(
getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo/publication")
.getPath()).saveAsTextFile(workingDir.toString() + "/preparedInfo/publication");
SparkCountryPropagationJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath",workingDir.toString() + "/source/publication",
"-resultTableName", Publication.class.getCanonicalName(),
"-workingPath", workingDir.toString(),
"-resultType", "publication"
});
JavaRDD<Publication> tmp = sc
.textFile(workingDir.toString() + "/publication")
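
Both tests now stage their classpath resources into the working directory before running the job. A hypothetical helper (not in the patch) that collapses the repeated textFile/saveAsTextFile block, assuming the sc and workingDir fields already set up in this class:

import org.apache.spark.api.java.JavaSparkContext;

// Copies a classpath resource tree into the working directory so the job
// under test reads it like any other filesystem path.
private void stage(JavaSparkContext sc, String resource, String dest) {
    sc.textFile(getClass().getResource(resource).getPath()).saveAsTextFile(dest);
}

// e.g. stage(sc, "/eu/dnetlib/dhp/countrypropagation/graph/publication",
//            workingDir.toString() + "/source/publication");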


@@ -5,6 +5,7 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import eu.dnetlib.dhp.countrypropagation.pojo.DatasourceCountry;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@@ -63,7 +64,7 @@ public class DatasourceCountryPreparationTest {
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"--outputPath", workingDir.toString() + "/datasourceCountry",
"--workingPath", workingDir.toString() + "/country",
"--allowedtypes", "pubsrepository::institutional",
"--whitelist",
"10|openaire____::3795d6478e30e2c9f787d427ff160944;10|opendoar____::16e6a3326dd7d868cbc926602a61e4d0;10|eurocrisdris::fe4903425d9040f680d8610d9079ea14;10|openaire____::5b76240cc27a58c6f7ceef7d8c36660e;10|openaire____::172bbccecf8fca44ab6a6653e84cb92a;10|openaire____::149c6590f8a06b46314eed77bfca693f;10|eurocrisdris::a6026877c1a174d60f81fd71f62df1c1;10|openaire____::4692342f0992d91f9e705c26959f09e0;10|openaire____::8d529dbb05ec0284662b391789e8ae2a;10|openaire____::345c9d171ef3c5d706d08041d506428c;10|opendoar____::1c1d4df596d01da60385f0bb17a4a9e0;10|opendoar____::7a614fd06c325499f1680b9896beedeb;10|opendoar____::1ee3dfcd8a0645a25a35977997223d22;10|opendoar____::d296c101daa88a51f6ca8cfc1ac79b50;10|opendoar____::798ed7d4ee7138d49b8828958048130a;10|openaire____::c9d2209ecc4d45ba7b4ca7597acb88a2;10|eurocrisdris::c49e0fe4b9ba7b7fab717d1f0f0a674d;10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539;10|eurocrisdris::432ca599953ff50cd4eeffe22faf3e48"
@@ -72,7 +73,7 @@ public class DatasourceCountryPreparationTest {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<DatasourceCountry> tmp = sc
.textFile(workingDir.toString() + "/datasourceCountry")
.textFile(workingDir.toString() + "/country/datasourceCountry")
.map(item -> OBJECT_MAPPER.readValue(item, DatasourceCountry.class));
Assertions.assertEquals(3, tmp.count());
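
Every read-back in these tests is the same two-step: load newline-delimited JSON, then map each line through Jackson. A generic sketch, assuming the OBJECT_MAPPER constant these test classes already define:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

// Hypothetical generic reader, not in the patch.
private static <T> JavaRDD<T> readJson(JavaSparkContext sc, String path, Class<T> clazz) {
    return sc.textFile(path).map(item -> OBJECT_MAPPER.readValue(item, clazz));
}

// e.g. readJson(sc, workingDir.toString() + "/country/datasourceCountry", DatasourceCountry.class)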


@@ -1,12 +1,11 @@
package eu.dnetlib.dhp.countrypropagation;
import static eu.dnetlib.dhp.PropagationConstant.isSparkSessionManaged;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import eu.dnetlib.dhp.countrypropagation.pojo.ResultCountrySet;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@@ -61,25 +60,25 @@ public class ResultCountryPreparationTest {
.getResource("/eu/dnetlib/dhp/countrypropagation/graph/publication")
.getPath();
final String preparedInfoPath = getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/datasourcecountry")
.getPath();
PrepareResultCountrySet
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--workingPath", workingDir.toString() + "/working",
"--sourcePath", sourcePath,
"--outputPath", workingDir.toString() + "/resultCountry",
"--preparedInfoPath", preparedInfoPath,
"--resultTableName", Publication.class.getCanonicalName()
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
sc.textFile(getClass()
.getResource("/eu/dnetlib/dhp/countrypropagation/datasourcecountry")
.getPath()).saveAsTextFile(workingDir.toString() + "/country/datasourceCountry");
PrepareResultCountrySet
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--workingPath", workingDir.toString() + "/country",
"--sourcePath", sourcePath,
"--resultTableName", Publication.class.getCanonicalName()
});
JavaRDD<ResultCountrySet> tmp = sc
.textFile(workingDir.toString() + "/resultCountry")
.textFile(workingDir.toString() + "/country/preparedInfo/publication")
.map(item -> OBJECT_MAPPER.readValue(item, ResultCountrySet.class));
Assertions.assertEquals(5, tmp.count());
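
The refactor drops the explicit --outputPath/--preparedInfoPath arguments in favour of a single --workingPath under which the jobs lay out fixed subdirectories. Hypothetical helpers making that layout explicit; the paths are inferred from what the tests read back above, not taken from the job code:

// <workingPath>/datasourceCountry : output of the datasource preparation step
static String datasourceCountryPath(String workingPath) {
    return workingPath + "/datasourceCountry";
}

// <workingPath>/preparedInfo/<resultType> : output of PrepareResultCountrySet
static String preparedInfoPath(String workingPath, String resultType) {
    return workingPath + "/preparedInfo/" + resultType;
}

// e.g. preparedInfoPath(workingDir.toString() + "/country", "publication")
//      yields the ".../country/preparedInfo/publication" path asserted above.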


@@ -80,7 +80,6 @@ public class OrcidPropagationJobTest {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-hive_metastore_uris", "",
"-saveGraph", "true",
"-resultTableName", Dataset.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/dataset",
"-possibleUpdatesPath", possibleUpdatesPath
@@ -125,8 +124,6 @@
.getPath(),
"-hive_metastore_uris",
"",
"-saveGraph",
"true",
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath",
@@ -193,8 +190,6 @@
.getPath(),
"-hive_metastore_uris",
"",
"-saveGraph",
"true",
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Dataset",
"-outputPath",


@@ -0,0 +1,207 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class PrepareStep1Test {
private static final Logger log = LoggerFactory.getLogger(PrepareStep1Test.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(PrepareStep1Test.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(PrepareStep1Test.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
spark = SparkSession
.builder()
.appName(PrepareStep1Test.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void noMatchTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparestep1")
.getPath();
PrepareResultOrcidAssociationStep1
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-resultTableName", Dataset.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/preparedInfo",
"-allowedsemrels", "IsSupplementedBy;IsSupplementTo",
"-allowedpids", "orcid;orcid_pending"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultOrcidList> tmp = sc
.textFile(workingDir.toString() + "/preparedInfo/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, ResultOrcidList.class));
Assertions.assertEquals(0, tmp.count());
Assertions
.assertEquals(
7, sc
.textFile(workingDir.toString() + "/preparedInfo/relationSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class))
.count());
Assertions
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/preparedInfo/resultSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
.count());
}
@Test
void matchTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparestep1")
.getPath();
PrepareResultOrcidAssociationStep1
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-resultTableName", Publication.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/preparedInfo",
"-allowedsemrels", "IsSupplementedBy;IsSupplementTo",
"-allowedpids", "orcid;orcid_pending"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultOrcidList> tmp = sc
.textFile(workingDir.toString() + "/preparedInfo/publication")
.map(item -> OBJECT_MAPPER.readValue(item, ResultOrcidList.class));
Assertions.assertEquals(1, tmp.count());
tmp.foreach(e -> System.out.println(OBJECT_MAPPER.writeValueAsString(e)));
Assertions
.assertEquals(
1, tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.count());
Assertions
.assertEquals(
1, tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.size());
Assertions
.assertEquals(
"0000-0002-5001-6911",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getOrcid());
Assertions
.assertEquals(
"Barbarić-Mikočević, Željka",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getFullname());
Assertions
.assertEquals(
"Željka",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getName());
Assertions
.assertEquals(
"Barbarić-Mikočević",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getSurname());
Assertions
.assertEquals(
7, sc
.textFile(workingDir.toString() + "/preparedInfo/relationSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class))
.count());
Assertions
.assertEquals(
1, sc
.textFile(workingDir.toString() + "/preparedInfo/resultSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class))
.count());
}
}
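
The getters exercised above (getResultId, getAuthorList, getOrcid, getFullname, getName, getSurname) pin down the shape of ResultOrcidList. A sketch of that shape, inferred from this test and the JSON fixtures further down rather than copied from the dhp sources:

import java.io.Serializable;
import java.util.List;

// Inferred bean; the real dhp class (and its author element type) may differ in name.
public class ResultOrcidList implements Serializable {
    private String resultId;
    private List<Author> authorList;

    public String getResultId() { return resultId; }
    public void setResultId(String resultId) { this.resultId = resultId; }
    public List<Author> getAuthorList() { return authorList; }
    public void setAuthorList(List<Author> authorList) { this.authorList = authorList; }

    public static class Author implements Serializable {
        private String name;
        private String surname;
        private String fullname;
        private String orcid;

        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
        public String getSurname() { return surname; }
        public void setSurname(String surname) { this.surname = surname; }
        public String getFullname() { return fullname; }
        public void setFullname(String fullname) { this.fullname = fullname; }
        public String getOrcid() { return orcid; }
        public void setOrcid(String orcid) { this.orcid = orcid; }
    }
}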


@@ -0,0 +1,222 @@
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class PrepareStep2Test {
private static final Logger log = LoggerFactory.getLogger(PrepareStep2Test.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(PrepareStep2Test.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(PrepareStep2Test.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
spark = SparkSession
.builder()
.appName(PrepareStep2Test.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void testMatch() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/resultSubset")
.getPath();
PrepareResultOrcidAssociationStep2
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/preparedInfo/mergedOrcidAssoc"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultOrcidList> tmp = sc
.textFile(workingDir.toString() + "/preparedInfo/mergedOrcidAssoc")
.map(item -> OBJECT_MAPPER.readValue(item, ResultOrcidList.class));
Assertions.assertEquals(1, tmp.count());
Assertions
.assertEquals(
1,
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.count());
Assertions
.assertEquals(
2, tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.size());
Assertions
.assertTrue(
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.stream()
.anyMatch(aa -> aa.getOrcid().equals("0000-0002-1234-5678")));
Assertions
.assertTrue(
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.stream()
.anyMatch(aa -> aa.getOrcid().equals("0000-0002-5001-6911")));
}
@Test
void matchTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparestep1")
.getPath();
PrepareResultOrcidAssociationStep1
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-sourcePath", sourcePath,
"-resultTableName", Publication.class.getCanonicalName(),
"-outputPath", workingDir.toString() + "/preparedInfo",
"-allowedsemrels", "IsSupplementedBy;IsSupplementTo",
"-allowedpids", "orcid;orcid_pending"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<ResultOrcidList> tmp = sc
.textFile(workingDir.toString() + "/preparedInfo/publication")
.map(item -> OBJECT_MAPPER.readValue(item, ResultOrcidList.class));
Assertions.assertEquals(1, tmp.count());
tmp.foreach(e -> System.out.println(OBJECT_MAPPER.writeValueAsString(e)));
Assertions
.assertEquals(
1, tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.count());
Assertions
.assertEquals(
1, tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.size());
Assertions
.assertEquals(
"0000-0002-5001-6911",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getOrcid());
Assertions
.assertEquals(
"Barbarić-Mikočević, Željka",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getFullname());
Assertions
.assertEquals(
"Željka",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getName());
Assertions
.assertEquals(
"Barbarić-Mikočević",
tmp
.filter(rol -> rol.getResultId().equals("50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217"))
.collect()
.get(0)
.getAuthorList()
.get(0)
.getSurname());
Assertions
.assertEquals(
7, sc
.textFile(workingDir.toString() + "/preparedInfo/relationSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class))
.count());
Assertions
.assertEquals(
1, sc
.textFile(workingDir.toString() + "/preparedInfo/resultSubset")
.map(item -> OBJECT_MAPPER.readValue(item, Publication.class))
.count());
}
}
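
testMatch above feeds two single-author records for the same resultId and expects one merged record carrying both authors. A plain-Java illustration of that merge semantics (the job itself does this on Spark; Author refers to the sketched bean above):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Group ResultOrcidList records by resultId and concatenate their author lists.
static Map<String, List<ResultOrcidList.Author>> mergeByResult(List<ResultOrcidList> in) {
    Map<String, List<ResultOrcidList.Author>> merged = new HashMap<>();
    for (ResultOrcidList rol : in) {
        merged.computeIfAbsent(rol.getResultId(), k -> new ArrayList<>())
            .addAll(rol.getAuthorList());
    }
    return merged;
}

Applied to the two fixture records below, this yields one entry for 50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217 carrying both ORCIDs, matching the assertEquals(2, ...) above.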


@@ -33,32 +33,32 @@ public class ProjectPropagationJobTest {
private static SparkSession spark;
private static Path workingDir;
private static final SparkConf conf = new SparkConf();
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(ProjectPropagationJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(ProjectPropagationJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
.builder()
.appName(ProjectPropagationJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@@ -71,6 +71,7 @@ public class ProjectPropagationJobTest {
@Test
void NoUpdateTest() throws Exception {
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
final String potentialUpdateDate = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates")
@@ -82,10 +83,10 @@
SparkResultToProjectThroughSemRelJob
.main(
new String[] {
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-hive_metastore_uris", "",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation",
"-potentialUpdatePath", potentialUpdateDate,
"-alreadyLinkedPath", alreadyLinkedPath,
@@ -98,6 +99,10 @@
.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
Assertions.assertEquals(0, tmp.count());
FileUtils.deleteDirectory(workingDir.toFile());
}
/**
@@ -107,6 +112,12 @@
*/
@Test
void UpdateTenTest() throws Exception {
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
spark = SparkSession
.builder()
.appName(ProjectPropagationJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
final String potentialUpdatePath = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates")
@@ -118,10 +129,10 @@
SparkResultToProjectThroughSemRelJob
.main(
new String[] {
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-hive_metastore_uris", "",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation",
"-potentialUpdatePath", potentialUpdatePath,
"-alreadyLinkedPath", alreadyLinkedPath,
@@ -169,6 +180,9 @@
.sql(
"Select * from temporary where datainfo.inferenceprovenance = 'propagation'")
.count());
FileUtils.deleteDirectory(workingDir.toFile());
}
/**
@@ -179,6 +193,12 @@
*/
@Test
void UpdateMixTest() throws Exception {
workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
spark = SparkSession
.builder()
.appName(ProjectPropagationJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
final String potentialUpdatepath = getClass()
.getResource(
"/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates")
@@ -190,10 +210,10 @@
SparkResultToProjectThroughSemRelJob
.main(
new String[] {
"-isTest", Boolean.TRUE.toString(),
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-hive_metastore_uris", "",
"-saveGraph", "true",
"-outputPath", workingDir.toString() + "/relation",
"-potentialUpdatePath", potentialUpdatepath,
"-alreadyLinkedPath", alreadyLinkedPath,
@@ -244,5 +264,7 @@
.sql(
"Select * from temporary where datainfo.inferenceprovenance = 'propagation'")
.count());
FileUtils.deleteDirectory(workingDir.toFile());
}
}
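
Each test here now recreates workingDir (and, in two of them, the SparkSession) and deletes the directory at the end. The same isolation could be expressed with JUnit 5 per-test lifecycle hooks; a sketch using the fields this class already declares:

import java.io.IOException;
import java.nio.file.Files;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;

@BeforeEach
void setUp() throws IOException {
    workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName());
    spark = SparkSession
        .builder()
        .appName(ProjectPropagationJobTest.class.getSimpleName())
        .config(conf)
        .getOrCreate();
}

@AfterEach
void tearDown() throws IOException {
    FileUtils.deleteDirectory(workingDir.toFile());
}

This keeps the test bodies free of setup/teardown noise while preserving the per-test temp directory the patch introduces.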

File diff suppressed because one or more lines are too long


@@ -0,0 +1 @@
{"resultId":"50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217","authorList":[{"name":"Željka","surname":"Barbarić-Mikočević","fullname":"Barbarić-Mikočević, Željka","orcid":"0000-0002-5001-6911"}]}


@@ -0,0 +1 @@
{"resultId":"50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217","authorList":[{"name":"Vesna","surname":"Džimbeg-Malčić","fullname":"Džimbeg-Malčić, Vesna","orcid":"0000-0002-1234-5678"}]}


@@ -0,0 +1,18 @@
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"isSupplementedBy","relType":"datasourceOrganization","source":"50|57a035e5b1ae::0637d444355058eb76ab6d7a842aa8b4","subRelType":"provision","target":"50|475c1990cbb2::02d3c300ac2d07135a6208159c512f62","validated":false}
{"collectedfrom":[{"key":"10|openaire____::21f8a223b9925c2f87c404096080b046","value":"Registry of Research Data Repository"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"isSupplementedBy","relType":"datasourceOrganization","source":"50|57a035e5b1ae::01894f77220771428abaecbfa2bcc8f7","subRelType":"provision","target":"50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217","validated":false}
{"collectedfrom":[{"key":"10|openaire____::6ac933301a3933c8a22ceebea7000326","value":"Academy of Finland"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"isSupplementTo","relType":"projectOrganization","source":"50|475c1990cbb2::02d3c300ac2d07135a6208159c512f62","subRelType":"participation","target":"50|57a035e5b1ae::0637d444355058eb76ab6d7a842aa8b4","validated":false}
{"collectedfrom":[{"key":"10|openaire____::6ac933301a3933c8a22ceebea7000326","value":"Academy of Finland"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"isSupplementTo","relType":"projectOrganization","source":"50|475c1990cbb2::46b9f15a3e887ccb154a696c4e7e4217","subRelType":"participation","target":"50|57a035e5b1ae::01894f77220771428abaecbfa2bcc8f7","validated":false}
{"collectedfrom":[{"key":"10|openaire____::457528c43fabd74e212db2ed61101075","value":"Agence Nationale de la Recherche"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"isSupplementedBy","relType":"projectOrganization","source":"50|57a035e5b1ae::07b10647d24e46073785210d4715f4e9","subRelType":"participation","target":"50|475c1990cbb2::699e01797642d72238c502ffcae18277","validated":false}
{"collectedfrom":[{"key":"10|openaire____::457528c43fabd74e212db2ed61101075","value":"Agence Nationale de la Recherche"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"IsSupplementedBy","relType":"projectOrganization","source":"50|57a035e5b1ae::0cee1d69f1cab270c382eaa853bcf4dc","subRelType":"participation","target":"50|475c1990cbb2::b778659ec5014f3db4c4e03c7907a69d","validated":false}
{"collectedfrom":[{"key":"10|openaire____::457528c43fabd74e212db2ed61101075","value":"Agence Nationale de la Recherche"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"IsSupplementTo","relType":"projectOrganization","source":"50|57a035e5b1ae::0d428b3119b0c822270df15058029172","subRelType":"participation","target":"50|475c1990cbb2::c8172336a860b66965e8d43a5494de2c","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::27b677f5d4a8b3a1159dba624016dc70","subRelType":"participation","target":"20|corda_______::0790e5c820c6a795d2b7524415cefb53","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::b5db617bb0f475b49584f5ee5120227c","subRelType":"participation","target":"20|corda_______::16220fe1781e3beb748872d31aa7f789","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::2907ce789238006cbe07f3e89820c9df","subRelType":"participation","target":"20|corda_______::43edcb7ca35d487ec357959e05c7ed7b","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::d185f413b046d7a7b15808388dad71a5","subRelType":"participation","target":"20|corda_______::46ac0acd65a3c66b10842bf291be9660","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::9c454e23267b520b621199fd4a79e3a6","subRelType":"participation","target":"20|corda_______::86fa29ae6a36610616e1691e1283f807","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::e40925978874b5f57378f301370e1293","subRelType":"participation","target":"20|corda_______::88e4a05f9c42a4830ffdd51663ed4538","validated":false}
{"collectedfrom":[{"key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8","value":"OpenOrgs Database"}],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.990"},"lastupdatetimestamp":1649252022894,"properties":[],"relClass":"merges","relType":"organizationOrganization","source":"20|pending_org_::5a01343420bc742ec1891cd98c36a258","subRelType":"dedup","target":"20|corda_______::a7468d48c5f0517ec67a2a9163af7150","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::531cfba3fa5e10f6be1e42e3c54cc95f","subRelType":"participation","target":"20|corda_______::b2233c6930da222c40e78302385a277d","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::48cb178c2561829bc2eedd787c052d48","subRelType":"participation","target":"20|corda_______::cd8ad1c4f710b667b74362c1674b92e6","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::795be98a5ba4c9190a32fc56033a9540","subRelType":"participation","target":"20|corda_______::f2323f9ed70f0f3a93fdfbb92f715e0e","validated":false}
{"collectedfrom":[{"key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f","value":"CORDA - COmmon Research DAta Warehouse"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1649252022977,"properties":[],"relClass":"hasParticipant","relType":"projectOrganization","source":"40|corda_______::ca5b255e4b2ef49ff424e0019962591c","subRelType":"participation","target":"20|corda_______::f2323f9ed70f0f3a93fdfbb92f715e0e","validated":false}