This commit is contained in:
Michele Artini 2024-02-22 14:01:11 +01:00
parent d2b7541583
commit f8cf7ffbcb
4 changed files with 53 additions and 17 deletions

View File

@@ -4,9 +4,9 @@ package eu.dnetlib.dhp.collection.plugin.base;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
@@ -26,7 +26,6 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Attribute;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
@@ -166,7 +165,7 @@ public class BaseAnalyzerJob {
final Set<String> paths = new LinkedHashSet<>();
final Set<String> types = new LinkedHashSet<>();
final Map<String, Map<String, String>> colls = new HashMap<>();
final List<BaseCollectionInfo> colls = new ArrayList<>();
for (final Object o : record.selectNodes("//*|//@*")) {
paths.add(((Node) o).getPath());
@@ -178,12 +177,13 @@ public class BaseAnalyzerJob {
if ("collection".equals(nodeName)) {
final String collName = n.getText().trim();
if (StringUtils.isNotBlank(collName)) {
final Map<String, String> attrs = new HashMap<>();
for (final Object ao : n.attributes()) {
attrs.put(((Attribute) ao).getName(), ((Attribute) ao).getValue());
}
colls.put(collName, attrs);
final BaseCollectionInfo coll = new BaseCollectionInfo();
coll.setId(collName);
coll.setOpendoarId(n.valueOf("@opendoar_id").trim());
coll.setRorId(n.valueOf("@ror_id").trim());
colls.add(coll);
}
} else if ("type".equals(nodeName)) {
types.add("TYPE: " + n.getText().trim());

View File

@@ -0,0 +1,38 @@
package eu.dnetlib.dhp.collection.plugin.base;
import java.io.Serializable;
/**
 * Immutable-shaped bean describing a BASE collection: its identifier plus the
 * optional OpenDOAR and ROR identifiers extracted from the record's
 * {@code collection} element attributes. Plain getters/setters are kept so the
 * class works with {@code Encoders.bean(...)} in Spark.
 */
public class BaseCollectionInfo implements Serializable {

	private static final long serialVersionUID = 5766333937429419647L;

	// BASE collection identifier (the trimmed text of the <collection> node)
	private String id;
	// value of the @opendoar_id attribute, may be empty
	private String opendoarId;
	// value of the @ror_id attribute, may be empty
	private String rorId;

	public String getId() {
		return id;
	}

	public String getOpendoarId() {
		return opendoarId;
	}

	public String getRorId() {
		return rorId;
	}

	public void setId(final String id) {
		this.id = id;
	}

	public void setOpendoarId(final String opendoarId) {
		this.opendoarId = opendoarId;
	}

	public void setRorId(final String rorId) {
		this.rorId = rorId;
	}
}

View File

@@ -3,16 +3,14 @@ package eu.dnetlib.dhp.collection.plugin.base;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class BaseRecordInfo implements Serializable {
private static final long serialVersionUID = -8848232018350074593L;
private String id;
private Map<String, Map<String, String>> collections = new HashMap<>();
private List<BaseCollectionInfo> collections = new ArrayList<>();
private List<String> paths = new ArrayList<>();
private List<String> types = new ArrayList<>();
@@ -40,11 +38,11 @@ public class BaseRecordInfo implements Serializable {
this.types = types;
}
public Map<String, Map<String, String>> getCollections() {
public List<BaseCollectionInfo> getCollections() {
return this.collections;
}
public void setCollections(final Map<String, Map<String, String>> collections) {
public void setCollections(final List<BaseCollectionInfo> collections) {
this.collections = collections;
}

View File

@@ -121,11 +121,11 @@ public class BaseCollectorIteratorTest {
}
final JavaRDD<BaseRecordInfo> rdd = JavaSparkContext
.fromSparkContext(spark.sparkContext())
.parallelize(ls);
.fromSparkContext(spark.sparkContext())
.parallelize(ls);
final Dataset<BaseRecordInfo> df = spark
.createDataset(rdd.rdd(), Encoders.bean(BaseRecordInfo.class));
.createDataset(rdd.rdd(), Encoders.bean(BaseRecordInfo.class));
df.printSchema();