diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index b27fc9267..d759f0d55 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -39,15 +39,15 @@ public class ModelConstants { public static final String IS_SUPPLEMENT_TO = "isSupplementTo"; public static final String IS_SUPPLEMENTED_BY = "isSupplementedBy"; public static final String PART = "part"; - public static final String IS_PART_OF = "IsPartOf"; - public static final String HAS_PARTS = "HasParts"; + public static final String IS_PART_OF = "isPartOf"; + public static final String HAS_PARTS = "hasParts"; public static final String RELATIONSHIP = "relationship"; public static final String CITATION = "citation"; public static final String CITES = "cites"; - public static final String IS_CITED_BY = "IsCitedBy"; + public static final String IS_CITED_BY = "isCitedBy"; public static final String REVIEW = "review"; public static final String REVIEWS = "reviews"; - public static final String IS_REVIEWED_BY = "IsReviewedBy"; + public static final String IS_REVIEWED_BY = "isReviewedBy"; public static final String RESULT_PROJECT = "resultProject"; public static final String OUTCOME = "outcome"; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java index 7f0ca983f..b2d3253d5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.project; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Optional; @@ -11,6 +12,7 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; import org.slf4j.Logger; @@ -175,43 +177,54 @@ public class PrepareProgramme { return csvProgramme; }); - prepareClassification(h2020Programmes); + // prepareClassification(h2020Programmes); - h2020Programmes - .map(csvProgramme -> OBJECT_MAPPER.writeValueAsString(csvProgramme)) + JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD rdd = jsc.parallelize(prepareClassification(h2020Programmes), 1); + rdd + .map(csvProgramme -> { + String tmp = OBJECT_MAPPER.writeValueAsString(csvProgramme); + return tmp; + }) .saveAsTextFile(outputPath); } - private static void prepareClassification(JavaRDD h2020Programmes) { + private static List prepareClassification(JavaRDD h2020Programmes) { Object[] codedescription = h2020Programmes - .map(value -> new Tuple2<>(value.getCode(), value.getTitle())) + .map( + value -> new Tuple2<>(value.getCode(), + new Tuple2(value.getTitle(), value.getShortTitle()))) .collect() .toArray(); for (int i = 0; i < codedescription.length - 1; i++) { for (int j = i + 1; j < codedescription.length; j++) { - Tuple2 t2i = (Tuple2) codedescription[i]; - Tuple2 t2j = (Tuple2) codedescription[j]; + Tuple2> t2i = (Tuple2>) codedescription[i]; + Tuple2> t2j = (Tuple2>) codedescription[j]; if (t2i._1().compareTo(t2j._1()) > 0) { - Tuple2 temp = t2i; + Tuple2> temp = t2i; codedescription[i] = t2j; codedescription[j] = temp; } } } - Map map = new HashMap<>(); + Map> map = new HashMap<>(); for (int j = 0; j < codedescription.length; j++) { - Tuple2 entry = (Tuple2) codedescription[j]; + Tuple2> entry = (Tuple2>) codedescription[j]; String ent = entry._1(); if (ent.contains("Euratom-")) { ent = ent.replace("-Euratom-", ".Euratom."); } String[] tmp = ent.split("\\."); if (tmp.length <= 2) { - map.put(entry._1(), entry._2()); - + if (StringUtils.isEmpty(entry._2()._2())) { + map.put(entry._1(), new Tuple2(entry._2()._1(), entry._2()._1())); + } else { + map.put(entry._1(), entry._2()); + } } else { if (ent.endsWith(".")) { ent = ent.substring(0, ent.length() - 1); @@ -224,14 +237,14 @@ public class PrepareProgramme { key = key.substring(0, key.length() - 1); } } - String current = entry._2(); + String current = entry._2()._1(); if (!ent.contains("Euratom")) { String parent; String tmp_key = tmp[0] + "."; for (int i = 1; i < tmp.length - 1; i++) { tmp_key += tmp[i] + "."; - parent = map.get(tmp_key).toLowerCase().trim(); + parent = map.get(tmp_key)._1().toLowerCase().trim(); if (parent.contains("|")) { parent = parent.substring(parent.lastIndexOf("|") + 1).trim(); } @@ -246,18 +259,29 @@ public class PrepareProgramme { } } - map.put(ent + ".", map.get(key) + " | " + current); + String shortTitle = entry._2()._2(); + if (StringUtils.isEmpty(shortTitle)) { + shortTitle = current; + } + Tuple2 newEntry = new Tuple2<>(map.get(key)._1() + " | " + current, + map.get(key)._2() + " | " + shortTitle); + map.put(ent + ".", newEntry); } } - h2020Programmes.foreach(csvProgramme -> { - if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom") - && !csvProgramme.getCode().equals("H2020-EC")) - csvProgramme.setClassification(map.get(csvProgramme.getCode() + ".")); - else - csvProgramme.setClassification(map.get(csvProgramme.getCode())); - }); + return h2020Programmes.map(csvProgramme -> { + + String code = csvProgramme.getCode(); + if (!code.endsWith(".") && !code.contains("Euratom") + && !code.equals("H2020-EC")) + code += "."; + + csvProgramme.setClassification(map.get(code)._1()); + csvProgramme.setClassification_short(map.get(code)._2()); + + return csvProgramme; + }).collect(); } public static Dataset readPath( diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java index f2375e799..a583b7bfa 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java @@ -9,7 +9,6 @@ import java.util.Objects; import java.util.Optional; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; @@ -138,7 +137,8 @@ public class SparkAtomicActionJob { pm.setCode(csvProject.getProgramme()); h2020classification.setClassification(ocsvProgramme.get().getClassification()); h2020classification.setH2020Programme(pm); - setLevelsAndProgramme(h2020classification, ocsvProgramme.get().getClassification()); + setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short()); + // setProgramme(h2020classification, ocsvProgramme.get().getClassification()); pp.setH2020classification(Arrays.asList(h2020classification)); return pp; @@ -177,8 +177,8 @@ public class SparkAtomicActionJob { } - private static void setLevelsAndProgramme(H2020Classification h2020Classification, String classification) { - String[] tmp = classification.split(" \\| "); + private static void setLevelsandProgramme(H2020Classification h2020Classification, String classification_short) { + String[] tmp = classification_short.split(" \\| "); h2020Classification.setLevel1(tmp[0]); if (tmp.length > 1) { h2020Classification.setLevel2(tmp[1]); @@ -189,6 +189,12 @@ public class SparkAtomicActionJob { h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]); } +// private static void setProgramme(H2020Classification h2020Classification, String classification) { +// String[] tmp = classification.split(" \\| "); +// +// h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]); +// } + public static Dataset readPath( SparkSession spark, String inputPath, Class clazz) { return spark diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java index 6f9a59087..f991a4297 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java @@ -22,6 +22,15 @@ public class CSVProgramme implements Serializable { private String shortTitle; private String language; private String classification; + private String classification_short; + + public String getClassification_short() { + return classification_short; + } + + public void setClassification_short(String classification_short) { + this.classification_short = classification_short; + } public String getClassification() { return classification; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java index fdc577687..59b536cd5 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/EXCELParserTest.java @@ -9,12 +9,14 @@ import java.util.List; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException; import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser; +@Disabled public class EXCELParserTest { private static Path workingDir; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java index c5801bccc..256dc0521 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java @@ -92,6 +92,8 @@ public class PrepareH2020ProgrammeTest { Assertions.assertEquals(0, verificationDataset.filter("classification = ''").count()); + // tmp.foreach(csvProgramme -> System.out.println(OBJECT_MAPPER.writeValueAsString(csvProgramme))); + Assertions .assertEquals( "Societal challenges | Smart, Green And Integrated Transport | CLEANSKY2 | IADP Fast Rotorcraft", diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/SparkUpdateProjectTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/SparkUpdateProjectTest.java index cfda7e718..42e494681 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/SparkUpdateProjectTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/SparkUpdateProjectTest.java @@ -78,7 +78,7 @@ public class SparkUpdateProjectTest { "-programmePath", getClass() .getResource( - "/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_classification_whole.json.gz") + "/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz") .getPath(), "-projectPath", getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json").getPath(), @@ -124,7 +124,7 @@ public class SparkUpdateProjectTest { .getString(0)); Assertions .assertEquals( - "Societal challenges", + "Societal Challenges", execverification .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") .select("classification.level1") @@ -133,7 +133,7 @@ public class SparkUpdateProjectTest { .getString(0)); Assertions .assertEquals( - "Smart, Green And Integrated Transport", + "Transport", execverification .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") .select("classification.level2") @@ -188,7 +188,7 @@ public class SparkUpdateProjectTest { .getString(0)); Assertions .assertEquals( - "Nurturing excellence by means of cross-border and cross-sector mobility", + "MSCA Mobility", execverification .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .select("classification.h2020Programme.description") @@ -197,7 +197,7 @@ public class SparkUpdateProjectTest { .getString(0)); Assertions .assertEquals( - "Excellent science", + "Excellent Science", execverification .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .select("classification.level1") @@ -206,7 +206,7 @@ public class SparkUpdateProjectTest { .getString(0)); Assertions .assertEquals( - "Marie Skłodowska-Curie Actions", + "Marie-Sklodowska-Curie Actions", execverification .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .select("classification.level2") @@ -215,7 +215,7 @@ public class SparkUpdateProjectTest { .getString(0)); Assertions .assertEquals( - "Nurturing excellence by means of cross-border and cross-sector mobility", + "MSCA Mobility", execverification .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .select("classification.level3") diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java index 8bcf08906..9370efdb3 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnectorTest.java @@ -6,8 +6,11 @@ import org.apache.commons.logging.LogFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.ssl.SSLContextBuilder; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; + +@Disabled public class HttpConnectorTest { private static final Log log = LogFactory.getLog(HttpConnectorTest.class); diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_classification_whole.json.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_classification_whole.json.gz deleted file mode 100644 index d40bac8c6..000000000 Binary files a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_classification_whole.json.gz and /dev/null differ diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz index 1afa73061..01e804ff5 100644 Binary files a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz and b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz differ diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz index 60c3bf05a..8b1982dee 100644 Binary files a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz and b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/project/whole_programme.json.gz differ diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala index 243719549..b3402ee9f 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala @@ -38,6 +38,8 @@ class QueryTest { def myQuery(spark:SparkSession, sc:SparkContext): Unit = { implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] + + val mapper = new ObjectMapper() mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java index 84f88003b..12543d8c7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctions.java @@ -18,6 +18,7 @@ import eu.dnetlib.dhp.schema.oaf.*; public class CleaningFunctions { public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; + public static final String NONE = "none"; public static T fixVocabularyNames(T value) { if (value instanceof Datasource) { @@ -106,6 +107,23 @@ public class CleaningFunctions { .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) .collect(Collectors.toList())); } + if (Objects.nonNull(r.getPid())) { + r + .setPid( + r + .getPid() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue()))) + .filter(sp -> NONE.equalsIgnoreCase(sp.getValue())) + .filter(sp -> Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(sp -> { + sp.setValue(StringUtils.trim(sp.getValue())); + return sp; + }) + .collect(Collectors.toList())); + } if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { r .setResourcetype( diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerAggregationTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerAggregationTest.scala index 4d83057f2..373683190 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerAggregationTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerAggregationTest.scala @@ -30,7 +30,7 @@ class SparkScholexplorerAggregationTest { implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication] - val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").getOrCreate() + val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate() val ds: Dataset[DLIPublication] = spark.createDataset(spark.sparkContext.parallelize(s)).as[DLIPublication] diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala index cb04cf9e9..c62d169bc 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/export/ExportDLITOOAFTest.scala @@ -3,14 +3,15 @@ package eu.dnetlib.dhp.export import java.time.LocalDateTime import java.time.format.DateTimeFormatter +import eu.dnetlib.dhp.provision.scholix.Scholix +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary import eu.dnetlib.dhp.schema.oaf.Relation import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication} - import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} import org.junit.jupiter.api.Test import scala.io.Source - +import scala.collection.JavaConverters._ class ExportDLITOOAFTest { val mapper = new ObjectMapper() @@ -22,12 +23,27 @@ class ExportDLITOOAFTest { } + def extractDatasources(s:Scholix):List[String]= { + s.getTarget.getCollectedFrom.asScala.map(c => c.getProvider.getName)(collection.breakOut) + } + + + def extractDatasources(s:ScholixSummary):List[String] = { + + s.getDatasources.asScala.map(c => c.getDatasourceName)(collection.breakOut) + + + } + + @Test def testMappingRele():Unit = { val r:Relation = new Relation r.setSource("60|fbff1d424e045eecf24151a5fe3aa738") r.setTarget("50|dedup_wf_001::ec409f09e63347d4e834087fe1483877") + r.setRelType("IsReferencedBy") + val r1 =DLIToOAF.convertDLIRelation(r) println(r1.getSource, r1.getTarget)