dnet-hadoop/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/base/BaseCollectorIteratorTest.java

101 lines
2.5 KiB
Java
Raw Normal View History

2024-02-14 11:39:37 +01:00
package eu.dnetlib.dhp.collection.plugin.base;
import static org.junit.jupiter.api.Assertions.assertEquals;
2024-02-12 12:19:57 +01:00
import java.util.HashMap;
2024-02-14 10:37:39 +01:00
import java.util.HashSet;
2024-02-12 12:19:57 +01:00
import java.util.Map;
2024-02-13 08:36:42 +01:00
import java.util.Map.Entry;
2024-02-14 10:37:39 +01:00
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
2024-02-12 12:19:57 +01:00
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Attribute;
2024-02-13 08:36:42 +01:00
import org.dom4j.Document;
2024-02-15 08:21:52 +01:00
import org.dom4j.DocumentHelper;
2024-02-06 15:10:29 +01:00
import org.dom4j.Element;
2024-02-14 10:37:39 +01:00
import org.dom4j.Node;
import org.junit.jupiter.api.Test;
2024-02-09 12:36:20 +01:00
2024-02-13 08:36:42 +01:00
import com.fasterxml.jackson.databind.ObjectMapper;
2024-02-09 12:36:20 +01:00
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
public class BaseCollectorIteratorTest {
@Test
void testImportFile() throws Exception {
2024-02-09 12:36:20 +01:00
long count = 0;
2024-02-09 12:36:20 +01:00
final BaseCollectorIterator iterator = new BaseCollectorIterator("base-sample.tar", new AggregatorReport());
2024-02-12 12:19:57 +01:00
final Map<String, Map<String, String>> collections = new HashMap<>();
2024-02-14 10:37:39 +01:00
final Map<String, AtomicInteger> fields = new HashMap<>();
final Set<String> types = new HashSet<>();
2024-02-12 12:19:57 +01:00
2024-02-09 12:36:20 +01:00
while (iterator.hasNext()) {
2024-02-15 08:21:52 +01:00
final Document record = DocumentHelper.parseText(iterator.next());
2024-02-12 12:19:57 +01:00
2024-02-09 12:36:20 +01:00
count++;
2024-02-12 12:19:57 +01:00
if ((count % 1000) == 0) {
System.out.println("# Read records: " + count);
}
2024-02-13 08:36:42 +01:00
// System.out.println(record.asXML());
2024-02-15 08:52:28 +01:00
for (final Object o : record.selectNodes("//*|//@*")) {
2024-02-14 10:37:39 +01:00
final String path = ((Node) o).getPath();
2024-02-13 08:36:42 +01:00
2024-02-14 10:37:39 +01:00
if (fields.containsKey(path)) {
fields.get(path).incrementAndGet();
} else {
fields.put(path, new AtomicInteger(1));
}
2024-02-12 12:19:57 +01:00
2024-02-15 08:52:28 +01:00
if (o instanceof Element) {
2024-02-14 10:37:39 +01:00
final Element n = (Element) o;
2024-02-15 08:52:28 +01:00
if ("collection".equals(n.getName())) {
final String collName = n.getText().trim();
if (StringUtils.isNotBlank(collName) && !collections.containsKey(collName)) {
final Map<String, String> collAttrs = new HashMap<>();
for (final Object ao : n.attributes()) {
collAttrs.put(((Attribute) ao).getName(), ((Attribute) ao).getValue());
}
collections.put(collName, collAttrs);
2024-02-14 10:37:39 +01:00
}
2024-02-15 08:52:28 +01:00
} else if ("type".equals(n.getName())) {
types.add(n.getText().trim());
2024-02-12 12:19:57 +01:00
}
}
}
2024-02-14 10:37:39 +01:00
2024-02-09 12:36:20 +01:00
}
2024-02-12 12:19:57 +01:00
2024-02-13 08:36:42 +01:00
final ObjectMapper mapper = new ObjectMapper();
for (final Entry<String, Map<String, String>> e : collections.entrySet()) {
System.out.println(e.getKey() + ": " + mapper.writeValueAsString(e.getValue()));
}
2024-02-12 12:19:57 +01:00
2024-02-14 10:37:39 +01:00
for (final Entry<String, AtomicInteger> e : fields.entrySet()) {
System.out.println(e.getKey() + ": " + e.getValue().get());
}
System.out.println("TYPES: ");
for (final String s : types) {
System.out.println(s);
}
assertEquals(30000, count);
}
}