From e254720377171b78fa105c6f38a0f6558100058d Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Thu, 15 Feb 2024 08:52:28 +0100 Subject: [PATCH] fixed path reports --- .../plugin/base/BaseAnalyzerJob.java | 67 +++++++++---------- .../base/BaseCollectorIteratorTest.java | 25 +++---- 2 files changed, 45 insertions(+), 47 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java index 5df3aad69..cd1216994 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/base/BaseAnalyzerJob.java @@ -48,18 +48,17 @@ public class BaseAnalyzerJob { public static void main(final String[] args) throws Exception { final String jsonConfiguration = IOUtils - .toString( - BaseAnalyzerJob.class - .getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json")); + .toString(BaseAnalyzerJob.class + .getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); @@ -75,11 +74,11 @@ public class BaseAnalyzerJob { } private static void processBaseRecords(final SparkSession spark, - final String inputPath, - final String outputPath) throws IOException { + final String inputPath, + final String outputPath) throws IOException { try (final FileSystem fs = FileSystem.get(new Configuration()); - final AggregatorReport report = new AggregatorReport()) { + final AggregatorReport report = new AggregatorReport()) { final Map fields = new HashMap<>(); final Map types = new HashMap<>(); final Map collections = new HashMap<>(); @@ -97,12 +96,12 @@ public class BaseAnalyzerJob { } private static void analyze(final FileSystem fs, - final String inputPath, - final Map fields, - final Map types, - final Map collections, - final Map totals, - final AggregatorReport report) throws JsonProcessingException, IOException, DocumentException { + final String inputPath, + final Map fields, + final Map types, + final Map collections, + final Map totals, + final AggregatorReport report) throws JsonProcessingException, IOException, DocumentException { final AtomicLong recordsCounter = new AtomicLong(0); @@ -124,27 +123,28 @@ public class BaseAnalyzerJob { final List recTypes = new ArrayList<>(); - for (final Object o : record.selectNodes("//*[local-name()='metadata']//*")) { + for (final Object o : record.selectNodes("//*|//@*")) { incrementMapCounter(fields, ((Node) o).getPath()); final String nodeName = ((Node) o).getName(); - if ("collection".equals(nodeName)) { + if (o instanceof Element) { final Element n = (Element) o; - final String collName = n.getText().trim(); - if (StringUtils.isNotBlank(collName)) { - final Map map = new HashMap<>(); - for (final Object ao : n.attributes()) { - map.put(((Attribute) ao).getName(), ((Attribute) ao).getValue()); + if ("collection".equals(nodeName)) { + final String collName = n.getText().trim(); + if (StringUtils.isNotBlank(collName)) { + final Map map = new HashMap<>(); + for (final Object ao : n.attributes()) { + map.put(((Attribute) ao).getName(), ((Attribute) ao).getValue()); + } + incrementMapCounter(collections, collName + ": " + OBJECT_MAPPER.writeValueAsString(map)); } - - incrementMapCounter(collections, collName + ": " + OBJECT_MAPPER.writeValueAsString(map)); + } else if ("type".equals(nodeName)) { + recTypes.add("TYPE: " + n.getText().trim()); + } else if ("typenorm".equals(nodeName)) { + recTypes.add("TYPE_NORM: " + n.getText().trim()); } - } else if ("type".equals(nodeName)) { - recTypes.add("TYPE: " + nodeName); - } else if ("typenorm".equals(nodeName)) { - recTypes.add("TYPE_NORM: " + nodeName); } } @@ -163,14 +163,11 @@ public class BaseAnalyzerJob { } private static void saveReport(final FileSystem fs, final String outputPath, final Map fields) - throws JsonProcessingException, IOException { + throws JsonProcessingException, IOException { try (final SequenceFile.Writer writer = SequenceFile - .createWriter( - fs.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer - .keyClass(IntWritable.class), - SequenceFile.Writer - .valueClass(Text.class), - SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) { + .createWriter(fs.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer + .keyClass(IntWritable.class), SequenceFile.Writer + .valueClass(Text.class), SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) { final Text key = new Text(); final Text value = new Text(); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIteratorTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIteratorTest.java index 368bc954e..a677eefa2 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIteratorTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIteratorTest.java @@ -47,7 +47,7 @@ public class BaseCollectorIteratorTest { // System.out.println(record.asXML()); - for (final Object o : record.selectNodes("//*[local-name()='metadata']//*")) { + for (final Object o : record.selectNodes("//*|//@*")) { final String path = ((Node) o).getPath(); if (fields.containsKey(path)) { @@ -56,22 +56,23 @@ public class BaseCollectorIteratorTest { fields.put(path, new AtomicInteger(1)); } - if ("collection".equals(((Node) o).getName())) { + if (o instanceof Element) { final Element n = (Element) o; - final String collName = n.getText().trim(); - if (StringUtils.isNotBlank(collName) && !collections.containsKey(collName)) { - final Map collAttrs = new HashMap<>(); - for (final Object ao : n.attributes()) { - collAttrs.put(((Attribute) ao).getName(), ((Attribute) ao).getValue()); + + if ("collection".equals(n.getName())) { + final String collName = n.getText().trim(); + if (StringUtils.isNotBlank(collName) && !collections.containsKey(collName)) { + final Map collAttrs = new HashMap<>(); + for (final Object ao : n.attributes()) { + collAttrs.put(((Attribute) ao).getName(), ((Attribute) ao).getValue()); + } + collections.put(collName, collAttrs); } - collections.put(collName, collAttrs); + } else if ("type".equals(n.getName())) { + types.add(n.getText().trim()); } - } - if ("type".equals(((Node) o).getName())) { - types.add(((Element) o).getText().trim()); } - } }