2024-02-14 11:39:37 +01:00
|
|
|
|
|
2024-02-05 15:19:41 +01:00
|
|
|
|
package eu.dnetlib.dhp.collection.plugin.base;

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.Node;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
2024-02-05 15:19:41 +01:00
|
|
|
|
|
2024-02-15 14:27:50 +01:00
|
|
|
|
@Disabled
|
2024-02-05 15:19:41 +01:00
|
|
|
|
public class BaseCollectorIteratorTest {
|
|
|
|
|
|
|
|
|
|
@Test
|
|
|
|
|
void testImportFile() throws Exception {
|
2024-02-09 12:36:20 +01:00
|
|
|
|
|
2024-02-05 15:19:41 +01:00
|
|
|
|
long count = 0;
|
|
|
|
|
|
2024-02-09 12:36:20 +01:00
|
|
|
|
final BaseCollectorIterator iterator = new BaseCollectorIterator("base-sample.tar", new AggregatorReport());
|
2024-02-05 15:19:41 +01:00
|
|
|
|
|
2024-02-12 12:19:57 +01:00
|
|
|
|
final Map<String, Map<String, String>> collections = new HashMap<>();
|
2024-02-14 10:37:39 +01:00
|
|
|
|
final Map<String, AtomicInteger> fields = new HashMap<>();
|
|
|
|
|
final Set<String> types = new HashSet<>();
|
2024-02-12 12:19:57 +01:00
|
|
|
|
|
2024-02-09 12:36:20 +01:00
|
|
|
|
while (iterator.hasNext()) {
|
2024-02-15 08:21:52 +01:00
|
|
|
|
|
|
|
|
|
final Document record = DocumentHelper.parseText(iterator.next());
|
2024-02-12 12:19:57 +01:00
|
|
|
|
|
2024-02-09 12:36:20 +01:00
|
|
|
|
count++;
|
2024-02-12 12:19:57 +01:00
|
|
|
|
|
|
|
|
|
if ((count % 1000) == 0) {
|
|
|
|
|
System.out.println("# Read records: " + count);
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-13 08:36:42 +01:00
|
|
|
|
// System.out.println(record.asXML());
|
|
|
|
|
|
2024-02-15 08:52:28 +01:00
|
|
|
|
for (final Object o : record.selectNodes("//*|//@*")) {
|
2024-02-14 10:37:39 +01:00
|
|
|
|
final String path = ((Node) o).getPath();
|
2024-02-13 08:36:42 +01:00
|
|
|
|
|
2024-02-14 10:37:39 +01:00
|
|
|
|
if (fields.containsKey(path)) {
|
|
|
|
|
fields.get(path).incrementAndGet();
|
|
|
|
|
} else {
|
|
|
|
|
fields.put(path, new AtomicInteger(1));
|
|
|
|
|
}
|
2024-02-12 12:19:57 +01:00
|
|
|
|
|
2024-02-15 08:52:28 +01:00
|
|
|
|
if (o instanceof Element) {
|
2024-02-14 10:37:39 +01:00
|
|
|
|
final Element n = (Element) o;
|
2024-02-15 08:52:28 +01:00
|
|
|
|
|
|
|
|
|
if ("collection".equals(n.getName())) {
|
|
|
|
|
final String collName = n.getText().trim();
|
|
|
|
|
if (StringUtils.isNotBlank(collName) && !collections.containsKey(collName)) {
|
|
|
|
|
final Map<String, String> collAttrs = new HashMap<>();
|
|
|
|
|
for (final Object ao : n.attributes()) {
|
|
|
|
|
collAttrs.put(((Attribute) ao).getName(), ((Attribute) ao).getValue());
|
|
|
|
|
}
|
|
|
|
|
collections.put(collName, collAttrs);
|
2024-02-14 10:37:39 +01:00
|
|
|
|
}
|
2024-02-15 08:52:28 +01:00
|
|
|
|
} else if ("type".equals(n.getName())) {
|
|
|
|
|
types.add(n.getText().trim());
|
2024-02-12 12:19:57 +01:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
}
|
2024-02-14 10:37:39 +01:00
|
|
|
|
|
2024-02-09 12:36:20 +01:00
|
|
|
|
}
|
2024-02-12 12:19:57 +01:00
|
|
|
|
|
2024-02-13 08:36:42 +01:00
|
|
|
|
final ObjectMapper mapper = new ObjectMapper();
|
|
|
|
|
for (final Entry<String, Map<String, String>> e : collections.entrySet()) {
|
|
|
|
|
System.out.println(e.getKey() + ": " + mapper.writeValueAsString(e.getValue()));
|
|
|
|
|
|
|
|
|
|
}
|
2024-02-12 12:19:57 +01:00
|
|
|
|
|
2024-02-14 10:37:39 +01:00
|
|
|
|
for (final Entry<String, AtomicInteger> e : fields.entrySet()) {
|
|
|
|
|
System.out.println(e.getKey() + ": " + e.getValue().get());
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
System.out.println("TYPES: ");
|
|
|
|
|
for (final String s : types) {
|
|
|
|
|
System.out.println(s);
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-05 15:19:41 +01:00
|
|
|
|
assertEquals(30000, count);
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-16 11:36:46 +01:00
|
|
|
|
@Test
|
|
|
|
|
public void testParquet() throws Exception {
|
|
|
|
|
|
|
|
|
|
final String xml = IOUtils.toString(getClass().getResourceAsStream("record.xml"));
|
|
|
|
|
|
|
|
|
|
final SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
|
|
|
|
|
|
|
|
|
|
final List<BaseRecordInfo> ls = new ArrayList<>();
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
|
ls.add(BaseAnalyzerJob.extractInfo(xml));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
final JavaRDD<BaseRecordInfo> rdd = JavaSparkContext
|
2024-02-22 14:01:11 +01:00
|
|
|
|
.fromSparkContext(spark.sparkContext())
|
|
|
|
|
.parallelize(ls);
|
2024-02-16 11:36:46 +01:00
|
|
|
|
|
|
|
|
|
final Dataset<BaseRecordInfo> df = spark
|
2024-02-22 14:01:11 +01:00
|
|
|
|
.createDataset(rdd.rdd(), Encoders.bean(BaseRecordInfo.class));
|
2024-02-16 11:36:46 +01:00
|
|
|
|
|
|
|
|
|
df.printSchema();
|
|
|
|
|
|
|
|
|
|
df.show(false);
|
|
|
|
|
}
|
2024-02-05 15:19:41 +01:00
|
|
|
|
}
|