From f8cf7ffbcb6f89cca36d04c919f6b2fc301fa858 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 22 Feb 2024 14:01:11 +0100 Subject: [PATCH] stats --- .../plugin/base/BaseAnalyzerJob.java | 18 ++++----- .../plugin/base/BaseCollectionInfo.java | 38 +++++++++++++++++++ .../plugin/base/BaseRecordInfo.java | 8 ++-- .../base/BaseCollectorIteratorTest.java | 6 +-- 4 files changed, 53 insertions(+), 17 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectionInfo.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java index 582676560..256915277 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java @@ -4,9 +4,9 @@ package eu.dnetlib.dhp.collection.plugin.base; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.IOException; -import java.util.HashMap; +import java.util.ArrayList; import java.util.LinkedHashSet; -import java.util.Map; +import java.util.List; import java.util.Optional; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; @@ -26,7 +26,6 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; -import org.dom4j.Attribute; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.DocumentHelper; @@ -166,7 +165,7 @@ public class BaseAnalyzerJob { final Set paths = new LinkedHashSet<>(); final Set types = new LinkedHashSet<>(); - final Map> colls = new HashMap<>(); + final List colls = new ArrayList<>(); for (final Object o : record.selectNodes("//*|//@*")) { paths.add(((Node) o).getPath()); @@ -178,12 +177,13 @@ public class BaseAnalyzerJob { if ("collection".equals(nodeName)) { final String collName = n.getText().trim(); + if (StringUtils.isNotBlank(collName)) { - final Map attrs = new HashMap<>(); - for (final Object ao : n.attributes()) { - attrs.put(((Attribute) ao).getName(), ((Attribute) ao).getValue()); - } - colls.put(collName, attrs); + final BaseCollectionInfo coll = new BaseCollectionInfo(); + coll.setId(collName); + coll.setOpendoarId(n.valueOf("@opendoar_id").trim()); + coll.setRorId(n.valueOf("@ror_id").trim()); + colls.add(coll); } } else if ("type".equals(nodeName)) { types.add("TYPE: " + n.getText().trim()); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectionInfo.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectionInfo.java new file mode 100644 index 000000000..06dfe45e2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectionInfo.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.collection.plugin.base; + +import java.io.Serializable; + +public class BaseCollectionInfo implements Serializable { + + private static final long serialVersionUID = 5766333937429419647L; + + private String id; + private String opendoarId; + private String rorId; + + public String getId() { + return this.id; + } + + public void setId(final String id) { + this.id = id; + } + + public String getOpendoarId() { + return this.opendoarId; + } + + public void setOpendoarId(final String opendoarId) { + this.opendoarId = opendoarId; + } + + public String getRorId() { + return this.rorId; + } + + public void setRorId(final String rorId) { + this.rorId = rorId; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseRecordInfo.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseRecordInfo.java index d7d635a0d..0fe6175a7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseRecordInfo.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseRecordInfo.java @@ -3,16 +3,14 @@ package eu.dnetlib.dhp.collection.plugin.base; import java.io.Serializable; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; public class BaseRecordInfo implements Serializable { private static final long serialVersionUID = -8848232018350074593L; private String id; - private Map> collections = new HashMap<>(); + private List collections = new ArrayList<>(); private List paths = new ArrayList<>(); private List types = new ArrayList<>(); @@ -40,11 +38,11 @@ public class BaseRecordInfo implements Serializable { this.types = types; } - public Map> getCollections() { + public List getCollections() { return this.collections; } - public void setCollections(final Map> collections) { + public void setCollections(final List collections) { this.collections = collections; } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIteratorTest.java index 57f01445e..6f19b53ff 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIteratorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIteratorTest.java @@ -121,11 +121,11 @@ public class BaseCollectorIteratorTest { } final JavaRDD rdd = JavaSparkContext - .fromSparkContext(spark.sparkContext()) - .parallelize(ls); + .fromSparkContext(spark.sparkContext()) + .parallelize(ls); final Dataset df = spark - .createDataset(rdd.rdd(), Encoders.bean(BaseRecordInfo.class)); + .createDataset(rdd.rdd(), Encoders.bean(BaseRecordInfo.class)); df.printSchema();