This commit is contained in:
Michele Artini 2024-02-22 14:01:11 +01:00
parent d2b7541583
commit f8cf7ffbcb
4 changed files with 53 additions and 17 deletions

View File

@ -4,9 +4,9 @@ package eu.dnetlib.dhp.collection.plugin.base;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException; import java.io.IOException;
import java.util.HashMap; import java.util.ArrayList;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
import java.util.Map; import java.util.List;
import java.util.Optional; import java.util.Optional;
import java.util.Set; import java.util.Set;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
@ -26,7 +26,6 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.dom4j.Attribute;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper; import org.dom4j.DocumentHelper;
@ -166,7 +165,7 @@ public class BaseAnalyzerJob {
final Set<String> paths = new LinkedHashSet<>(); final Set<String> paths = new LinkedHashSet<>();
final Set<String> types = new LinkedHashSet<>(); final Set<String> types = new LinkedHashSet<>();
final Map<String, Map<String, String>> colls = new HashMap<>(); final List<BaseCollectionInfo> colls = new ArrayList<>();
for (final Object o : record.selectNodes("//*|//@*")) { for (final Object o : record.selectNodes("//*|//@*")) {
paths.add(((Node) o).getPath()); paths.add(((Node) o).getPath());
@ -178,12 +177,13 @@ public class BaseAnalyzerJob {
if ("collection".equals(nodeName)) { if ("collection".equals(nodeName)) {
final String collName = n.getText().trim(); final String collName = n.getText().trim();
if (StringUtils.isNotBlank(collName)) { if (StringUtils.isNotBlank(collName)) {
final Map<String, String> attrs = new HashMap<>(); final BaseCollectionInfo coll = new BaseCollectionInfo();
for (final Object ao : n.attributes()) { coll.setId(collName);
attrs.put(((Attribute) ao).getName(), ((Attribute) ao).getValue()); coll.setOpendoarId(n.valueOf("@opendoar_id").trim());
} coll.setRorId(n.valueOf("@ror_id").trim());
colls.put(collName, attrs); colls.add(coll);
} }
} else if ("type".equals(nodeName)) { } else if ("type".equals(nodeName)) {
types.add("TYPE: " + n.getText().trim()); types.add("TYPE: " + n.getText().trim());

View File

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.collection.plugin.base;
import java.io.Serializable;
/**
 * Serializable bean describing one collection reference: the collection id
 * plus two optional external identifiers (OpenDOAR and ROR).
 *
 * <p>All three properties are plain strings with a getter/setter pair each;
 * no validation or normalisation is performed by this class, values are
 * stored and returned exactly as supplied.
 */
public class BaseCollectionInfo implements Serializable {

	private static final long serialVersionUID = 5766333937429419647L;

	/** Collection identifier. */
	private String id;

	/** OpenDOAR identifier associated with the collection. */
	private String opendoarId;

	/** ROR identifier associated with the collection. */
	private String rorId;

	/**
	 * @return the collection identifier, or {@code null} if never set
	 */
	public String getId() {
		return id;
	}

	/**
	 * @param id the collection identifier to store
	 */
	public void setId(final String id) {
		this.id = id;
	}

	/**
	 * @return the OpenDOAR identifier, or {@code null} if never set
	 */
	public String getOpendoarId() {
		return opendoarId;
	}

	/**
	 * @param opendoarId the OpenDOAR identifier to store
	 */
	public void setOpendoarId(final String opendoarId) {
		this.opendoarId = opendoarId;
	}

	/**
	 * @return the ROR identifier, or {@code null} if never set
	 */
	public String getRorId() {
		return rorId;
	}

	/**
	 * @param rorId the ROR identifier to store
	 */
	public void setRorId(final String rorId) {
		this.rorId = rorId;
	}

}

View File

@ -3,16 +3,14 @@ package eu.dnetlib.dhp.collection.plugin.base;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
public class BaseRecordInfo implements Serializable { public class BaseRecordInfo implements Serializable {
private static final long serialVersionUID = -8848232018350074593L; private static final long serialVersionUID = -8848232018350074593L;
private String id; private String id;
private Map<String, Map<String, String>> collections = new HashMap<>(); private List<BaseCollectionInfo> collections = new ArrayList<>();
private List<String> paths = new ArrayList<>(); private List<String> paths = new ArrayList<>();
private List<String> types = new ArrayList<>(); private List<String> types = new ArrayList<>();
@ -40,11 +38,11 @@ public class BaseRecordInfo implements Serializable {
this.types = types; this.types = types;
} }
public Map<String, Map<String, String>> getCollections() { public List<BaseCollectionInfo> getCollections() {
return this.collections; return this.collections;
} }
public void setCollections(final Map<String, Map<String, String>> collections) { public void setCollections(final List<BaseCollectionInfo> collections) {
this.collections = collections; this.collections = collections;
} }

View File

@ -121,11 +121,11 @@ public class BaseCollectorIteratorTest {
} }
final JavaRDD<BaseRecordInfo> rdd = JavaSparkContext final JavaRDD<BaseRecordInfo> rdd = JavaSparkContext
.fromSparkContext(spark.sparkContext()) .fromSparkContext(spark.sparkContext())
.parallelize(ls); .parallelize(ls);
final Dataset<BaseRecordInfo> df = spark final Dataset<BaseRecordInfo> df = spark
.createDataset(rdd.rdd(), Encoders.bean(BaseRecordInfo.class)); .createDataset(rdd.rdd(), Encoders.bean(BaseRecordInfo.class));
df.printSchema(); df.printSchema();