fixed path reports

This commit is contained in:
Michele Artini 2024-02-15 08:52:28 +01:00
parent 8d85c1e97e
commit e254720377
2 changed files with 45 additions and 47 deletions

View File

@@ -48,18 +48,17 @@ public class BaseAnalyzerJob {
public static void main(final String[] args) throws Exception { public static void main(final String[] args) throws Exception {
final String jsonConfiguration = IOUtils final String jsonConfiguration = IOUtils
.toString( .toString(BaseAnalyzerJob.class
BaseAnalyzerJob.class .getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json"));
.getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/action_set_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args); parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged")) .ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf) .map(Boolean::valueOf)
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
@@ -75,11 +74,11 @@ public class BaseAnalyzerJob {
} }
private static void processBaseRecords(final SparkSession spark, private static void processBaseRecords(final SparkSession spark,
final String inputPath, final String inputPath,
final String outputPath) throws IOException { final String outputPath) throws IOException {
try (final FileSystem fs = FileSystem.get(new Configuration()); try (final FileSystem fs = FileSystem.get(new Configuration());
final AggregatorReport report = new AggregatorReport()) { final AggregatorReport report = new AggregatorReport()) {
final Map<String, AtomicLong> fields = new HashMap<>(); final Map<String, AtomicLong> fields = new HashMap<>();
final Map<String, AtomicLong> types = new HashMap<>(); final Map<String, AtomicLong> types = new HashMap<>();
final Map<String, AtomicLong> collections = new HashMap<>(); final Map<String, AtomicLong> collections = new HashMap<>();
@@ -97,12 +96,12 @@ public class BaseAnalyzerJob {
} }
private static void analyze(final FileSystem fs, private static void analyze(final FileSystem fs,
final String inputPath, final String inputPath,
final Map<String, AtomicLong> fields, final Map<String, AtomicLong> fields,
final Map<String, AtomicLong> types, final Map<String, AtomicLong> types,
final Map<String, AtomicLong> collections, final Map<String, AtomicLong> collections,
final Map<String, AtomicLong> totals, final Map<String, AtomicLong> totals,
final AggregatorReport report) throws JsonProcessingException, IOException, DocumentException { final AggregatorReport report) throws JsonProcessingException, IOException, DocumentException {
final AtomicLong recordsCounter = new AtomicLong(0); final AtomicLong recordsCounter = new AtomicLong(0);
@@ -124,27 +123,28 @@ public class BaseAnalyzerJob {
final List<String> recTypes = new ArrayList<>(); final List<String> recTypes = new ArrayList<>();
for (final Object o : record.selectNodes("//*[local-name()='metadata']//*")) { for (final Object o : record.selectNodes("//*|//@*")) {
incrementMapCounter(fields, ((Node) o).getPath()); incrementMapCounter(fields, ((Node) o).getPath());
final String nodeName = ((Node) o).getName(); final String nodeName = ((Node) o).getName();
if ("collection".equals(nodeName)) { if (o instanceof Element) {
final Element n = (Element) o; final Element n = (Element) o;
final String collName = n.getText().trim(); if ("collection".equals(nodeName)) {
if (StringUtils.isNotBlank(collName)) { final String collName = n.getText().trim();
final Map<String, String> map = new HashMap<>(); if (StringUtils.isNotBlank(collName)) {
for (final Object ao : n.attributes()) { final Map<String, String> map = new HashMap<>();
map.put(((Attribute) ao).getName(), ((Attribute) ao).getValue()); for (final Object ao : n.attributes()) {
map.put(((Attribute) ao).getName(), ((Attribute) ao).getValue());
}
incrementMapCounter(collections, collName + ": " + OBJECT_MAPPER.writeValueAsString(map));
} }
} else if ("type".equals(nodeName)) {
incrementMapCounter(collections, collName + ": " + OBJECT_MAPPER.writeValueAsString(map)); recTypes.add("TYPE: " + n.getText().trim());
} else if ("typenorm".equals(nodeName)) {
recTypes.add("TYPE_NORM: " + n.getText().trim());
} }
} else if ("type".equals(nodeName)) {
recTypes.add("TYPE: " + nodeName);
} else if ("typenorm".equals(nodeName)) {
recTypes.add("TYPE_NORM: " + nodeName);
} }
} }
@@ -163,14 +163,11 @@ public class BaseAnalyzerJob {
} }
private static void saveReport(final FileSystem fs, final String outputPath, final Map<String, AtomicLong> fields) private static void saveReport(final FileSystem fs, final String outputPath, final Map<String, AtomicLong> fields)
throws JsonProcessingException, IOException { throws JsonProcessingException, IOException {
try (final SequenceFile.Writer writer = SequenceFile try (final SequenceFile.Writer writer = SequenceFile
.createWriter( .createWriter(fs.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer
fs.getConf(), SequenceFile.Writer.file(new Path(outputPath)), SequenceFile.Writer .keyClass(IntWritable.class), SequenceFile.Writer
.keyClass(IntWritable.class), .valueClass(Text.class), SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
SequenceFile.Writer
.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DeflateCodec()))) {
final Text key = new Text(); final Text key = new Text();
final Text value = new Text(); final Text value = new Text();

View File

@@ -47,7 +47,7 @@ public class BaseCollectorIteratorTest {
// System.out.println(record.asXML()); // System.out.println(record.asXML());
for (final Object o : record.selectNodes("//*[local-name()='metadata']//*")) { for (final Object o : record.selectNodes("//*|//@*")) {
final String path = ((Node) o).getPath(); final String path = ((Node) o).getPath();
if (fields.containsKey(path)) { if (fields.containsKey(path)) {
@@ -56,22 +56,23 @@ public class BaseCollectorIteratorTest {
fields.put(path, new AtomicInteger(1)); fields.put(path, new AtomicInteger(1));
} }
if ("collection".equals(((Node) o).getName())) { if (o instanceof Element) {
final Element n = (Element) o; final Element n = (Element) o;
final String collName = n.getText().trim();
if (StringUtils.isNotBlank(collName) && !collections.containsKey(collName)) { if ("collection".equals(n.getName())) {
final Map<String, String> collAttrs = new HashMap<>(); final String collName = n.getText().trim();
for (final Object ao : n.attributes()) { if (StringUtils.isNotBlank(collName) && !collections.containsKey(collName)) {
collAttrs.put(((Attribute) ao).getName(), ((Attribute) ao).getValue()); final Map<String, String> collAttrs = new HashMap<>();
for (final Object ao : n.attributes()) {
collAttrs.put(((Attribute) ao).getName(), ((Attribute) ao).getValue());
}
collections.put(collName, collAttrs);
} }
collections.put(collName, collAttrs); } else if ("type".equals(n.getName())) {
types.add(n.getText().trim());
} }
}
if ("type".equals(((Node) o).getName())) {
types.add(((Element) o).getText().trim());
} }
} }
} }