First implementation of the csv dump

Miriam Baglioni 2023-05-29 10:16:47 +02:00
parent f79b9d5c0d
commit 2e0999a1df
31 changed files with 1090 additions and 528 deletions

View File

@ -10,10 +10,13 @@ import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.csv.Constants;
import eu.dnetlib.dhp.oa.graph.dump.csv.DumpCommunities;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -22,6 +25,8 @@ public class QueryInformationSystem {
private ISLookUpService isLookUp;
private static final Logger log = LoggerFactory.getLogger(QueryInformationSystem.class);
private static final String XQUERY_ALL = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
+
" where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " +
@ -79,6 +84,7 @@ public class QueryInformationSystem {
List<String> communities = new ArrayList<>();
for (String xml : isLookUp.quickSearchProfile(toString)) {
log.info(xml);
final Document doc;
final SAXReader reader = new SAXReader();
reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
@ -91,7 +97,12 @@ public class QueryInformationSystem {
builder.append(Constants.SEP);
builder.append(root.attribute("id").getValue());
builder.append(Constants.SEP);
builder.append(((Node) (root.selectNodes("/description").get(0))).getText());
builder
.append(
((Node) (root.selectNodes("//description").get(0)))
.getText()
.replace("\n", " ")
.replace("\t", " "));
communities.add(builder.toString());
}
return communities;
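Taken together with the prova test further down, the builder above produces one delimiter-separated line per community. A minimal hedged sketch of that line (hypothetical community values; Constants.SEP is assumed here to be a tab):

import eu.dnetlib.dhp.utils.DHPUtils;

public class CommunityCsvLineSketch {
    public static void main(String[] args) {
        String sep = "\t"; // assumption: stands in for Constants.SEP
        String id = "dh-ch";
        String label = "Digital Humanities and Cultural Heritage";
        String description = "This community gathers research results,\nand related projects.";

        String line = DHPUtils.md5(id) + sep
            + label + sep
            + id + sep
            + description.replace("\n", " ").replace("\t", " "); // newlines/tabs flattened as above
        System.out.println(line);
    }
}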

View File

@ -17,6 +17,15 @@ public class AuthorResult implements Serializable {
private String orcid;
private String resultId;
private String rank;
private Boolean fromOrcid;
public Boolean getFromOrcid() {
return fromOrcid;
}
public void setFromOrcid(Boolean fromOrcid) {
this.fromOrcid = fromOrcid;
}
public String getFullName() {
return fullName;
@ -86,7 +95,7 @@ public class AuthorResult implements Serializable {
if (orcid != null) {
authorId = DHPUtils.md5(orcid);
} else {
authorId = DHPUtils.md5(resultId + rank);
authorId = DHPUtils.md5(resultId + rank);
}
}

View File

@ -1,11 +1,19 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem;
import eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.commons.lang3.StringUtils.split;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
@ -18,91 +26,94 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem;
import eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
/**
* @author miriam.baglioni
* @Date 09/05/23
*/
//STEP 1
public class DumpCommunities implements Serializable {
private static final Logger log = LoggerFactory.getLogger(DumpCommunities.class);
private final BufferedWriter writer;
private static final Logger log = LoggerFactory.getLogger(DumpCommunities.class);
private final BufferedWriter writer;
private final static String HEADER = "id" + Constants.SEP + "name" + Constants.SEP + "acronym" + Constants.SEP
+ " description \n";
private final transient QueryInformationSystem queryInformationSystem;
private final transient QueryInformationSystem queryInformationSystem;
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpCommunities.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste1.json"));
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
DumpCommunities.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_step3.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String nameNode = parser.get("nameNode");
log.info("nameNode: {}", nameNode);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final List<String> communities = Arrays.asList(split(parser.get("communities"), ";"));
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final DumpCommunities dc = new DumpCommunities(outputPath, nameNode, parser.get("isLookUpUrl"));
final String workingPath = parser.get("workingPath");
dc.writeCommunity(communities);
final String nameNode = parser.get("nameNode");
log.info("nameNode: {}", nameNode);
}
private void writeCommunity(List<String> communities)
throws IOException, ISLookUpException, DocumentException, SAXException {
writer.write(HEADER);
writer.flush();
String a = IOUtils
.toString(
DumpCommunities.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq"));
final DumpCommunities dc = new DumpCommunities(outputPath, nameNode, parser.get("isLookUp)"));
final String xquery = String
.format(
a,
communities
.stream()
.map(t -> String.format("$x//CONFIGURATION/context[./@id= '%s']", t))
.collect(Collectors.joining(" or ")));
dc.writeCommunity();
for (String community : queryInformationSystem
.getCommunityCsv(xquery)) {
writer
.write(
community);
writer.write("\n");
}
}
writer.close();
}
private void writeCommunity() throws IOException, ISLookUpException, DocumentException, SAXException {
for(String community : queryInformationSystem.getCommunityCsv(IOUtils.toString(
DumpCommunities.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq"))))
{
writer
.write(
community);
writer.write("\n");
public DumpCommunities(String hdfsPath, String hdfsNameNode, String isLookUpUrl) throws Exception {
final Configuration conf = new Configuration();
queryInformationSystem = new QueryInformationSystem();
queryInformationSystem.setIsLookUp(Utils.getIsLookUpService(isLookUpUrl));
}
writer.close();
}
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
Path hdfsWritePath = new Path(hdfsPath);
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
public DumpCommunities(String hdfsPath, String hdfsNameNode, String isLookUpUrl) throws Exception {
final Configuration conf = new Configuration();
queryInformationSystem= new QueryInformationSystem();
queryInformationSystem.setIsLookUp(Utils.getIsLookUpService(isLookUpUrl));
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
Path hdfsWritePath = new Path(hdfsPath);
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8));
}
writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8));
}
}
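For reference, a hedged sketch of how this step could be launched standalone; every value is a placeholder, and the argument names follow input_dump_csv_ste1.json and the dump_communities workflow action further down:

public class DumpCommunitiesRunSketch {
    public static void main(String[] args) throws Exception {
        DumpCommunities
            .main(new String[] {
                "-outputPath", "/tmp/csv_dump/community", // placeholder HDFS path
                "-nameNode", "hdfs://nameservice1", // placeholder name node
                "-isLookUpUrl", "http://example.org/is/services/isLookUp?wsdl", // placeholder lookup URL
                "-communities", "dh-ch;enermaps;beopen" // ';'-separated, as split in main
            });
    }
}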

View File

@ -9,8 +9,6 @@ import java.util.*;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRelResAut;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
@ -20,10 +18,13 @@ import org.apache.spark.sql.Dataset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVPid;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRelResAut;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
@ -59,9 +60,6 @@ public class SparkDumpResults implements Serializable {
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String resultType = parser.get("resultType");
log.info("resultType: {}", resultType);
@ -78,14 +76,14 @@ public class SparkDumpResults implements Serializable {
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
run(spark, inputPath, outputPath, inputClazz, resultType, workingPath);
// Utils.removeOutputDir(spark, outputPath);
run(spark, inputPath, inputClazz, resultType, workingPath);
});
}
private static <R extends Result> void run(SparkSession spark, String inputPath, String outputPath,
private static <R extends Result> void run(SparkSession spark, String inputPath,
Class<R> inputClazz, String resultType, String workingPath) {
Dataset<String> resultIds = spark.read().textFile(workingPath + "/resultIds");
@ -94,85 +92,104 @@ public class SparkDumpResults implements Serializable {
.filter(
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() && !p.getDataInfo().getInvisible());
// map results
resultIds
.joinWith(results, resultIds.col("value").equalTo(results.col("id")))
.map((MapFunction<Tuple2<String, R>, R>) t2 -> t2._2(), Encoders.bean(inputClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/" + resultType + "/temp/result");
// map results
results = Utils.readPath(spark, workingPath + "/" + resultType + "/temp/result", inputClazz);
results
.map(
(MapFunction<Tuple2<String, R>, CSVResult>) t2 -> mapResultInfo(t2._2()),
(MapFunction<R, CSVResult>) r -> mapResultInfo(r),
Encoders.bean(CSVResult.class))
.write()
.option("compression", "gzip")
.option("header","true")
.option("delimiter",Constants.SEP)
// .option("header", "true")
// .option("delimiter", Constants.SEP)
.mode(SaveMode.Overwrite)
.csv(workingPath + "/" + resultType + "/result");
.json(workingPath + "/" + resultType + "/result");
// map relations between pid and result
resultIds
.joinWith(results, resultIds.col("value").equalTo(results.col("id")))
.flatMap((FlatMapFunction<Tuple2<String, R>, CSVPid>) t2 -> {
results
.flatMap((FlatMapFunction<R, CSVPid>) r -> {
List<CSVPid> pids = new ArrayList<>();
if (Optional.ofNullable(t2._2().getPid()).isPresent() && t2._2().getPid().size() > 0) {
pids.addAll(mapPid(t2._2().getPid(), t2._1()));
if (Optional.ofNullable(r.getPid()).isPresent() && r.getPid().size() > 0) {
pids.addAll(mapPid(r.getPid(), r.getId()));
}
return pids.iterator();
}, Encoders.bean(CSVPid.class))
.filter(Objects::nonNull)
.write()
.option("compression", "gzip")
.option("header","true")
.option("delimiter", Constants.SEP)
// .option("header", "true")
// .option("delimiter", Constants.SEP)
.mode(SaveMode.Overwrite)
.csv(workingPath + "/" + resultType + "/result_pid");
.json(workingPath + "/" + resultType + "/result_pid");
// map authors from the result
// for each author in the result:
// if the author has an ORCID, the author id is derived from the ORCID (i.e. md5(orcid))
// if there is no ORCID, the id is built from result_id + author rank (when the rank is missing,
// the author's position within the author list is used instead), always hashed with md5
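// A minimal sketch of this rule (hypothetical variable values), mirroring AuthorResult.autosetId shown
// earlier in this commit:
// String authorId = (orcid != null)
//     ? DHPUtils.md5(orcid) // ORCID available: the id depends only on the ORCID
//     : DHPUtils.md5(resultId + rank); // no ORCID: md5 of resultId + rank (or the author's position)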
Dataset<AuthorResult> authorResult = resultIds
.joinWith(results, resultIds.col("value").equalTo(results.col("id")))
.flatMap((FlatMapFunction<Tuple2<String, R>, AuthorResult>) t2 -> {
results
.flatMap((FlatMapFunction<R, AuthorResult>) r -> {
int count = 0;
List<AuthorResult> arl = new ArrayList<>();
for (Author a : t2._2().getAuthor()) {
count += 1;
AuthorResult ar = new AuthorResult();
ar.setResultId(t2._1());
if (Optional.ofNullable(a.getRank()).isPresent()) {
if (a.getRank() > 0) {
ar.setRank(String.valueOf(a.getRank()));
} else {
ar.setRank(String.valueOf(count));
if (Optional.ofNullable(r.getAuthor()).isPresent()) {
for (Author a : r.getAuthor()) {
count += 1;
AuthorResult ar = new AuthorResult();
ar.setResultId(r.getId());
if (Optional.ofNullable(a.getRank()).isPresent()) {
if (a.getRank() > 0) {
ar.setRank(String.valueOf(a.getRank()));
} else {
ar.setRank(String.valueOf(count));
}
}
ar.setFirstName(a.getName());
ar.setLastName(a.getSurname());
ar.setFullName(a.getFullname());
Tuple2<String, Boolean> orcid = getOrcid(a.getPid());
if (Optional.ofNullable(orcid).isPresent()) {
ar.setOrcid(orcid._1());
ar.setFromOrcid(orcid._2());
}
ar.autosetId();
arl.add(ar);
}
ar.setFirstName(a.getName());
ar.setLastName(a.getSurname());
ar.setFullName(a.getFullname());
ar.setOrcid(getOrcid(a.getPid()));
ar.autosetId();
arl.add(ar);
}
return arl.iterator();
}, Encoders.bean(AuthorResult.class));
}, Encoders.bean(AuthorResult.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath + "/" + resultType + "/temp/authorresult");
Dataset<AuthorResult> authorResult = Utils
.readPath(spark, workingPath + "/" + resultType + "/temp/authorresult", AuthorResult.class);
// map the relation between author and result
authorResult
.map(
(MapFunction<AuthorResult, CSVRelResAut>) ar -> {
CSVRelResAut ret = new CSVRelResAut();
ret.setResult_id(ar.getResultId() );
ret.setAuthor_id( ar.getAuthorId());
ret.setResult_id(ar.getResultId());
ret.setAuthor_id(ar.getAuthorId());
return ret;
},
Encoders.bean(CSVRelResAut.class))
.write()
.option("compression", "gzip")
.option("header","true")
.option("delimiter",Constants.SEP)
// .option("header", "true")
// .option("delimiter", Constants.SEP)
.mode(SaveMode.Overwrite)
.csv(workingPath + "/" + resultType + "/result_author");
.json(workingPath + "/" + resultType + "/result_author");
// map the authors in the working dir. I do not want to have them repeated
authorResult
@ -182,23 +199,28 @@ public class SparkDumpResults implements Serializable {
Encoders.bean(CSVAuthor.class))
.write()
.option("compression", "gzip")
.option("header","true")
.option("delimiter",Constants.SEP)
// .option("header", "true")
// .option("delimiter", Constants.SEP)
.mode(SaveMode.Overwrite)
.csv(workingPath + "/" + resultType + "/author");
.json(workingPath + "/" + resultType + "/author");
}
private static List<CSVPid> mapPid(List<StructuredProperty> pid, String resultId) {
return pid.stream().map(p -> p.getQualifier().getClassid().toLowerCase() + "@" + p.getValue().toLowerCase()).distinct().map(p -> {
CSVPid ret = new CSVPid();
ret.setId(DHPUtils.md5(p));
ret.setResult_id(resultId);
ret.setPid(split(p, "@")[1]);
ret.setType(split(p, "@")[0]);
return pid
.stream()
.map(p -> p.getQualifier().getClassid().toLowerCase() + "@" + p.getValue().toLowerCase())
.distinct()
.map(p -> {
CSVPid ret = new CSVPid();
ret.setId(DHPUtils.md5(p));
ret.setResult_id(resultId);
ret.setPid(split(p, "@")[1]);
ret.setType(split(p, "@")[0]);
return ret;
}).collect(Collectors.toList());
return ret;
})
.collect(Collectors.toList());
}
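// A hedged note on the dedup above (hypothetical pid values): pids are first encoded as lowercase
// "classid@value" strings so that distinct() collapses case-only duplicates, e.g.
// (doi, 10.1023/FakeDoi) and (doi, 10.1023/fakedoi) -> "doi@10.1023/fakedoi",
// and each surviving string is split back on "@" into type ("doi") and pid ("10.1023/fakedoi"),
// while the CSVPid id is DHPUtils.md5 of the whole encoded string.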
@ -213,6 +235,7 @@ public class SparkDumpResults implements Serializable {
if (ar.getOrcid() != null) {
ret.setOrcid(ar.getOrcid());
ret.setFromOrcid(ar.getFromOrcid());
} else {
ret.setOrcid("");
}
@ -220,27 +243,33 @@ public class SparkDumpResults implements Serializable {
return ret;
}
private static String getOrcid(List<StructuredProperty> pid) {
private static Tuple2<String, Boolean> getOrcid(List<StructuredProperty> pid) {
if (!Optional.ofNullable(pid).isPresent())
return null;
if (pid.size() == 0)
return null;
for (StructuredProperty p : pid) {
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
return p.getValue();
return new Tuple2<>(p.getValue(), Boolean.TRUE);
}
}
for (StructuredProperty p : pid) {
if (p.getQualifier().getClassid().equals(ModelConstants.ORCID_PENDING)) {
return new Tuple2<>(p.getValue(), Boolean.FALSE);
}
}
return null;
}
private static String getFieldValue(Field<String> input){
private static String getFieldValue(Field<String> input) {
if (input != null &&
StringUtils.isNotEmpty(input.getValue())) {
StringUtils.isNotEmpty(input.getValue())) {
return input.getValue();
} else {
return "";
}
}
private static <R extends Result> CSVResult mapResultInfo(R r) {
CSVResult ret = new CSVResult();
ret.setId(r.getId());
@ -251,16 +280,24 @@ public class SparkDumpResults implements Serializable {
ret.setPublication_date(getFieldValue(r.getDateofacceptance()));
ret.setPublisher(getFieldValue(r.getPublisher()));
ret.setKeywords(String.join(", ", r.getSubject().stream().map(s -> {
if (StringUtils.isNotEmpty(s.getValue()))
return s.getValue().toLowerCase();
else
return null;}).filter(Objects::nonNull).distinct().collect(Collectors.toList())));
if (Optional.ofNullable(r.getSubject()).isPresent())
ret.setKeywords(String.join(", ", r.getSubject().stream().map(s -> {
if (StringUtils.isNotEmpty(s.getValue()))
return s.getValue().toLowerCase();
else
return null;
}).filter(Objects::nonNull).distinct().collect(Collectors.toList())));
else
ret.setKeywords("");
if (Optional.ofNullable(r.getCountry()).isPresent())
ret
.setCountry(
String.join(", ", r.getCountry().stream().map(Country::getClassid).collect(Collectors.toList())));
else
ret.setCountry("");
ret.setCountry(String.join(", ", r.getCountry().stream().map(Country::getClassid).collect(Collectors.toList())));
if (StringUtils.isNotEmpty(r.getLanguage().getClassid())) {
if (Optional.ofNullable(r.getLanguage()).isPresent() && StringUtils.isNotEmpty(r.getLanguage().getClassid())) {
ret.setLanguage(r.getLanguage().getClassid());
} else {
ret.setLanguage("");
@ -270,7 +307,7 @@ public class SparkDumpResults implements Serializable {
}
private static String getAbstract(List<Field<String>> description) {
if(description == null)
if (description == null)
return "";
for (Field<String> abs : description) {
if (StringUtils.isNotEmpty(abs.getValue())) {

View File

@ -1,109 +1,133 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVPid;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRelResAut;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult;
import eu.dnetlib.dhp.schema.oaf.*;
/**
* @author miriam.baglioni
* @Date 10/05/23
*/
//STEP 4
public class SparkMoveOnSigleDir implements Serializable {
//All the products saved in different directories are put under the same one.
// For the authors also a step of reconciliation must be done, since the same author id can be saved in more than one directory
// All the products saved in different directories are put under the same one.
// For the authors also a step of reconciliation must be done, since the same author id can be saved in more than
// one directory
private static final Logger log = LoggerFactory.getLogger(SparkMoveOnSigleDir.class);
private static final Logger log = LoggerFactory.getLogger(SparkMoveOnSigleDir.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkMoveOnSigleDir.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_step2.json"));
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkMoveOnSigleDir.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste4.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
// Utils.removeOutputDir(spark, outputPath);
run(spark, outputPath, workingPath);
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath );
run(spark, outputPath, workingPath);
});
});
}
}
private static <R extends Result> void run(SparkSession spark, String outputPath,
String workingPath) {
private static <R extends Result> void run(SparkSession spark, String outputPath,
String workingPath) {
Utils
.readPath(spark, workingPath + "/publication/result", CSVResult.class)
.union(Utils.readPath(spark, workingPath + "/dataset/result", CSVResult.class))
.union(Utils.readPath(spark, workingPath + "/software/result", CSVResult.class))
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result", CSVResult.class))
.write()
.mode(SaveMode.Overwrite)
.option("header", "true")
.option("delimiter", Constants.SEP)
.option("compression", "gzip")
.csv(outputPath + "/result");
spark.read().textFile(workingPath + "/publication/result", workingPath + "/dataset/result", workingPath + "/software/result", workingPath + "/otherresearchproduct/result")
.write()
.mode(SaveMode.Overwrite)
.csv(outputPath + "/result");
Utils
.readPath(spark, workingPath + "/publication/result_pid", CSVPid.class)
.union(Utils.readPath(spark, workingPath + "/dataset/result_pid", CSVPid.class))
.union(Utils.readPath(spark, workingPath + "/software/result_pid", CSVPid.class))
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result_pid", CSVPid.class))
.write()
.mode(SaveMode.Overwrite)
.option("header", "true")
.option("delimiter", Constants.SEP)
.option("compression", "gzip")
.csv(outputPath + "/result_pid");
spark.read().textFile(workingPath + "/publication/result_pid", workingPath + "/dataset/result_pid", workingPath + "/software/result_pid", workingPath + "/otherresearchproduct/result_pid")
.write()
.mode(SaveMode.Overwrite)
.csv(outputPath + "/result_pid");
Utils
.readPath(spark, workingPath + "/publication/result_author", CSVRelResAut.class)
.union(Utils.readPath(spark, workingPath + "/dataset/result_author", CSVRelResAut.class))
.union(Utils.readPath(spark, workingPath + "/software/result_author", CSVRelResAut.class))
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result_author", CSVRelResAut.class))
.write()
.mode(SaveMode.Overwrite)
.option("header", "true")
.option("delimiter", Constants.SEP)
.option("compression", "gzip")
.csv(outputPath + "/result_author");
Utils
.readPath(spark, workingPath + "/publication/author", CSVAuthor.class)
.union(Utils.readPath(spark, workingPath + "/dataset/author", CSVAuthor.class))
.union(Utils.readPath(spark, workingPath + "/software/author", CSVAuthor.class))
.union(Utils.readPath(spark, workingPath + "/otherresearchproduct/author", CSVAuthor.class))
.groupByKey((MapFunction<CSVAuthor, String>) r -> r.getId(), Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, CSVAuthor, CSVAuthor>) (k, it) -> it.next(), Encoders.bean(CSVAuthor.class))
.write()
.mode(SaveMode.Overwrite)
.option("header", "true")
.option("delimiter", Constants.SEP)
.option("compression", "gzip")
.csv(outputPath + "/author");
spark.read().textFile(workingPath + "/publication/result_author", workingPath + "/dataset/result_author", workingPath + "/software/result_author", workingPath + "/otherresearchproduct/result_author")
.write()
.mode(SaveMode.Overwrite)
.csv(outputPath + "/result_author");
}
spark.read().textFile(workingPath + "/publication/result_author", workingPath + "/dataset/result_author", workingPath + "/software/result_author", workingPath + "/otherresearchproduct/result_author")
.groupByKey((MapFunction<String, String>) a -> a.split("\t")[0], Encoders.STRING())
.mapGroups((MapGroupsFunction<String, String, String>) (k, it) -> it.next(), Encoders.STRING() )
.write()
.mode(SaveMode.Overwrite)
.csv(outputPath + "/author");
}
}

View File

@ -4,10 +4,7 @@ package eu.dnetlib.dhp.oa.graph.dump.csv;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
@ -40,6 +37,7 @@ public class SparkSelectResultsAndDumpRelations implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkSelectResultsAndDumpRelations.class);
private static String RESULT_COMMUNITY_TABLE = "/result_community";
private static String COMMUNITY_RESULT_IDS = "/communityResultIds";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
@ -77,7 +75,7 @@ public class SparkSelectResultsAndDumpRelations implements Serializable {
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
Utils.removeOutputDir(spark, outputPath + RESULT_COMMUNITY_TABLE);
run(spark, inputPath, outputPath, workingPath, finalCommunityList);
});
@ -99,7 +97,6 @@ public class SparkSelectResultsAndDumpRelations implements Serializable {
spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList,
workingPath + COMMUNITY_RESULT_IDS);
// write the relations between results and communities
writeCommunityResultRelations(
spark, inputPath + "/publication", Publication.class, communityList, outputPath + RESULT_COMMUNITY_TABLE);
@ -109,7 +106,7 @@ public class SparkSelectResultsAndDumpRelations implements Serializable {
spark, inputPath + "/software", Software.class, communityList, outputPath + RESULT_COMMUNITY_TABLE);
writeCommunityResultRelations(
spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList,
outputPath + RESULT_COMMUNITY_TABLE);
outputPath + RESULT_COMMUNITY_TABLE);
// select the relations with semantics 'cites'
org.apache.spark.sql.Dataset<Relation> relations = Utils
@ -148,8 +145,8 @@ public class SparkSelectResultsAndDumpRelations implements Serializable {
Encoders.bean(CSVCitation.class))
.write()
.option("compression", "gzip")
.option("header","true")
.option("delimiter", Constants.SEP)
.option("header", "true")
.option("delimiter", Constants.SEP)
.mode(SaveMode.Overwrite)
.csv(outputPath + "/relation");
@ -171,17 +168,24 @@ public class SparkSelectResultsAndDumpRelations implements Serializable {
(FilterFunction<R>) p -> !p.getDataInfo().getDeletedbyinference() &&
!p.getDataInfo().getInvisible())
.flatMap((FlatMapFunction<R, CSVRELCommunityResult>) p -> {
Set<String> inserted = new HashSet<>();
List<CSVRELCommunityResult> ret = new ArrayList<>();
for (String context :p.getContext().stream().map(Context::getId).distinct().collect(Collectors.toList())) {
for (String context : p
.getContext()
.stream()
.map(Context::getId)
.distinct()
.collect(Collectors.toList())) {
String cId = context.contains("::")
? context.substring(0, context.indexOf("::"))
: context;
if (communityList.contains(cId)) {
if (communityList.contains(cId) && !inserted.contains(cId)) {
CSVRELCommunityResult crc = new CSVRELCommunityResult();
crc.setResult_id(p.getId());
crc.setCommunity_id(DHPUtils.md5(cId));
ret.add(crc);
inserted.add(cId);
}
}
return ret.iterator();
@ -189,8 +193,8 @@ public class SparkSelectResultsAndDumpRelations implements Serializable {
.write()
.option("compression", "gzip")
.mode(SaveMode.Append)
.option("header","true")
.option("delimiter",Constants.SEP)
.option("header", "true")
.option("delimiter", Constants.SEP)
.csv(outputPath);
}
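A hedged illustration (hypothetical context id) of the community-id extraction used in the flatMap above: a context entry may carry a "::"-separated suffix that is stripped before the communityList check and before hashing with DHPUtils.md5.

public class CommunityIdSketch {
    public static void main(String[] args) {
        String context = "dh-ch::subcommunity"; // hypothetical context entry
        String cId = context.contains("::")
            ? context.substring(0, context.indexOf("::"))
            : context;
        System.out.println(cId); // -> dh-ch
    }
}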

View File

@ -13,6 +13,15 @@ public class CSVAuthor implements Serializable {
private String lastname;
private String fullname;
private String orcid;
private Boolean fromOrcid;
public Boolean getFromOrcid() {
return fromOrcid;
}
public void setFromOrcid(Boolean fromOrcid) {
this.fromOrcid = fromOrcid;
}
public String getId() {
return id;

View File

@ -1,4 +1,4 @@
<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="dump_graph_csv" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
@ -65,22 +65,27 @@
</property>
</configuration>
</global>
<start to="fork_select_result" />
<start to="fork_dump_result_author_pid" />
<fork name="fork_select_result">
<path start="select_publication"/>
<path start="select_dataset"/>
<path start="select_orp"/>
<path start="select_software"/>
</fork>
<action name="dump_communities">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.csv.DumpCommunities</main-class>
<arg>--outputPath</arg><arg>${outputPath}/community</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--communities</arg><arg>${communities}</arg>
</java>
<ok to="select_result_dump_relation"/>
<error to="Kill"/>
</action>
<action name="select_publication">
<action name="select_result_dump_relation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>select results from publication </name>
<class>eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults</class>
<name>select results ids connected to communities and dump relation </name>
<class>eu.dnetlib.dhp.oa.graph.dump.csv.SparkSelectResultsAndDumpRelations</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@ -93,15 +98,50 @@
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}</arg>
<arg>--workingPath</arg><arg>${outputPath}/workingDir</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--communities</arg><arg>${communities}</arg>
</spark>
<ok to="fork_dump_result_author_pid"/>
<error to="Kill"/>
</action>
<fork name="fork_dump_result_author_pid">
<path start="dump_publication"/>
<path start="dump_dataset"/>
<path start="dump_other"/>
<path start="dump_software"/>
</fork>
<action name="dump_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>select results from publication </name>
<class>eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=9G
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--workingPath</arg><arg>${outputPath}/workingDir</arg>
<arg>--resultType</arg><arg>publication</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="select_dataset">
<action name="dump_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -120,14 +160,14 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}</arg>
<arg>--communities</arg><arg>${communities}</arg>
<arg>--workingPath</arg><arg>${outputPath}/workingDir</arg>
<arg>--resultType</arg><arg>dataset</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="select_orp">
<action name="dump_other">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -146,14 +186,14 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}</arg>
<arg>--communities</arg><arg>${communities}</arg>
<arg>--workingPath</arg><arg>${outputPath}/workingDir</arg>
<arg>--resultType</arg><arg>otherresearchproduct</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<action name="select_software">
<action name="dump_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -172,24 +212,25 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}</arg>
<arg>--communities</arg><arg>${communities}</arg>
<arg>--workingPath</arg><arg>${outputPath}/workingDir</arg>
<arg>--resultType</arg><arg>software</arg>
</spark>
<ok to="join_dump"/>
<error to="Kill"/>
</action>
<join name="join_dump" to="dump_single_results"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="dump_project">
<action name="dump_single_results">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<name>Dump single results </name>
<class>eu.dnetlib.dhp.oa.graph.dump.csv.SparkMoveOnSigleDir</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@ -201,38 +242,15 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/project</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--outputPath</arg><arg>${workingDir}/project</arg>
<arg>--communityMapPath</arg><arg>noneed</arg>
<arg>--workingPath</arg><arg>${outputPath}/workingDir</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="get_new_projects"/>
<error to="Kill"/>
</action>
<action name="get_new_projects">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table project </name>
<class>eu.dnetlib.dhp.oa.graph.dump.projectssubset.ProjectsSubsetSparkJob</class>
<jar>dump-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/project</arg>
<arg>--outputPath</arg><arg>${workingDir}/tar/project</arg>
<arg>--projectListPath</arg><arg>${projectListPath}</arg>
</spark>
<ok to="make_archive"/>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="make_archive">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.MakeTar</main-class>

View File

@ -0,0 +1,30 @@
[
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "nn",
"paramLongName": "nameNode",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": true
},
{
"paramName":"ilu",
"paramLongName":"isLookUpUrl",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName":"c",
"paramLongName":"communities",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
}
]

View File

@ -6,12 +6,7 @@
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",

View File

@ -0,0 +1,25 @@
[
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName":"wp",
"paramLongName":"workingPath",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName":"o",
"paramLongName":"outputPath",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
}
]

View File

@ -1,6 +1,6 @@
for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')
where $x//CONFIGURATION/context[./@type='community' or ./@type='ri']
and ($x//CONFIGURATION/context[./@id='dh-ch'] or $x//CONFIGURATION/context[./@id='dariah'] or $x//CONFIGURATION/context[./@id='enermaps'] or $x//CONFIGURATION/context[./@id='beopen'])
and (%s)
return
<community>
{$x//CONFIGURATION/context/@id}
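A hedged sketch (hypothetical community ids) of how the %s placeholder above is filled; it mirrors the String.format call in DumpCommunities.writeCommunity shown earlier:

import java.util.Arrays;
import java.util.stream.Collectors;

public class CommunityFilterSketch {
    public static void main(String[] args) {
        String filter = Arrays
            .asList("dh-ch", "beopen")
            .stream()
            .map(t -> String.format("$x//CONFIGURATION/context[./@id= '%s']", t))
            .collect(Collectors.joining(" or "));
        System.out.println(filter);
        // $x//CONFIGURATION/context[./@id= 'dh-ch'] or $x//CONFIGURATION/context[./@id= 'beopen']
    }
}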

View File

@ -1,15 +1,26 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.utils.DHPUtils;
import static org.apache.commons.lang3.StringUtils.split;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Optional;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
@ -17,253 +28,310 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Optional;
import com.fasterxml.jackson.databind.ObjectMapper;
import static org.apache.commons.lang3.StringUtils.split;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
* @author miriam.baglioni
* @Date 11/05/23
*/
public class DumpResultTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static SparkSession spark;
private static Path workingDir;
private static Path workingDir;
private static final Logger log = LoggerFactory
.getLogger(DumpResultTest.class);
private static final Logger log = LoggerFactory
.getLogger(DumpResultTest.class);
private static HashMap<String, String> map = new HashMap<>();
private static HashMap<String, String> map = new HashMap<>();
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(DumpResultTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(DumpResultTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(DumpResultTest.class.getSimpleName());
SparkConf conf = new SparkConf();
conf.setAppName(DumpResultTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(DumpResultTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
spark = SparkSession
.builder()
.appName(DumpResultTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testDumpResult() throws Exception {
@Test
public void testDumpResult() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/")
.getPath();
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/")
.getPath();
spark.read().text(getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds")
.getPath())
.write()
.text(workingDir.toString() + "/working/resultIds/");
spark
.read()
.text(
getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds")
.getPath())
.write()
.text(workingDir.toString() + "/working/resultIds/");
SparkDumpResults.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/output",
"-workingPath", workingDir.toString() + "/working",
"-resultType", "publication",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-sourcePath", sourcePath
});
SparkDumpResults.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/output",
"-workingPath", workingDir.toString() + "/working",
"-resultType", "publication",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-sourcePath", sourcePath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
Dataset<Row> tmp = spark
.read()
.option("header", "true")
.option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/working/publication/result");
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
Assertions.assertEquals(3, tmp.count());
Row row = tmp
.where("id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'")
.first();
Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAs("accessright"));
Assertions.assertEquals("FI", row.getAs("country"));
Assertions.assertEquals("Lit.opg., bijl.", row.getAs("description"));
Assertions.assertEquals(3, split(row.getAs("keywords"), ", ").length);
Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie"));
Assertions.assertTrue(row.getAs("keywords").toString().contains("prospectie"));
Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology"));
Assertions.assertEquals("nl", row.getAs("language"));
Assertions.assertEquals("2007-01-01", row.getAs("publication_date"));
Assertions.assertEquals("FakePublisher1", row.getAs("publisher"));
Assertions
.assertEquals(
"Inventariserend veldonderzoek d.m.v. boringen (karterende fase) : Raadhuisstraat te Dirkshorn, gemeente Harenkarspel",
row.getAs("title"));
Assertions.assertEquals("publication", row.getAs("type"));
Dataset<Row> tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/working/publication/result");
row = tmp
.where("id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'")
.first();
Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAs("accessright"));
Assertions.assertEquals(2, split(row.getAs("country"), ", ").length);
Assertions.assertNull(row.getAs("description"));
Assertions.assertEquals(2, split(row.getAs("keywords"), ", ").length);
Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie"));
Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology"));
Assertions.assertEquals("UNKNOWN", row.getAs("language"));
Assertions.assertNull(row.getAs("publication_date"));
Assertions.assertNull(row.getAs("publisher"));
Assertions.assertEquals("None", row.getAs("title"));
Assertions.assertEquals("publication", row.getAs("type"));
Assertions.assertEquals(3, tmp.count());
Row row = tmp
.where("id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'")
.first();
Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(),row.getAs("accessright"));
Assertions.assertEquals("FI" ,row.getAs("country"));
Assertions.assertEquals("Lit.opg., bijl." ,row.getAs("description"));
Assertions.assertEquals(3 ,split(row.getAs("keywords"), ", ").length);
Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie"));
Assertions.assertTrue(row.getAs("keywords").toString().contains("prospectie"));
Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology"));
Assertions.assertEquals("nl", row.getAs("language"));
Assertions.assertEquals("2007-01-01", row.getAs("publication_date"));
Assertions.assertEquals("FakePublisher1", row.getAs("publisher"));
Assertions.assertEquals("Inventariserend veldonderzoek d.m.v. boringen (karterende fase) : Raadhuisstraat te Dirkshorn, gemeente Harenkarspel", row.getAs("title"));
Assertions.assertEquals("publication", row.getAs("type"));
row = tmp
.where("id = '50|DansKnawCris::26780065282e607306372abd0d808245'")
.first();
Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAs("accessright"));
Assertions.assertNull(row.getAs("country"));
Assertions.assertNull(row.getAs("description"));
Assertions.assertEquals(2, split(row.getAs("keywords"), ", ").length);
Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie"));
Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology"));
Assertions.assertEquals("UNKNOWN", row.getAs("language"));
Assertions.assertNull(row.getAs("publication_date"));
Assertions.assertNull(row.getAs("publisher"));
Assertions.assertEquals("None", row.getAs("title"));
Assertions.assertEquals("publication", row.getAs("type"));
row = tmp
.where("id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'")
.first();
Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(),row.getAs("accessright"));
Assertions.assertEquals(2 ,split(row.getAs("country"), ", ").length);
Assertions.assertNull(row.getAs("description"));
Assertions.assertEquals(2 ,split(row.getAs("keywords"), ", ").length);
Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie"));
Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology"));
Assertions.assertEquals("UNKNOWN", row.getAs("language"));
Assertions.assertNull( row.getAs("publication_date"));
Assertions.assertNull( row.getAs("publisher"));
Assertions.assertEquals("None", row.getAs("title"));
Assertions.assertEquals("publication", row.getAs("type"));
}
row = tmp
.where("id = '50|DansKnawCris::26780065282e607306372abd0d808245'")
.first();
Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(),row.getAs("accessright"));
Assertions.assertNull(row.getAs("country"));
Assertions.assertNull(row.getAs("description"));
Assertions.assertEquals(2 ,split(row.getAs("keywords"), ", ").length);
Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie"));
Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology"));
Assertions.assertEquals("UNKNOWN", row.getAs("language"));
Assertions.assertNull( row.getAs("publication_date"));
Assertions.assertNull( row.getAs("publisher"));
Assertions.assertEquals("None", row.getAs("title"));
Assertions.assertEquals("publication", row.getAs("type"));
@Test
public void testDumpAuthor() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/")
.getPath();
}
spark
.read()
.text(
getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds")
.getPath())
.write()
.text(workingDir.toString() + "/working/resultIds/");
@Test
public void testDumpAuthor() throws Exception {
SparkDumpResults.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/output",
"-workingPath", workingDir.toString() + "/working",
"-resultType", "publication",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-sourcePath", sourcePath
});
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/")
.getPath();
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
spark.read().text(getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds")
.getPath())
.write()
.text(workingDir.toString() + "/working/resultIds/");
Dataset<Row> tmp = spark
.read()
.option("header", "true")
.option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/working/publication/author");
SparkDumpResults.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/output",
"-workingPath", workingDir.toString() + "/working",
"-resultType", "publication",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-sourcePath", sourcePath
});
Assertions.assertEquals(5, tmp.count());
Assertions.assertEquals(1, tmp.where("firstName == 'Maryam'").count());
Assertions
.assertEquals(
DHPUtils.md5("50|DansKnawCris::0224aae28af558f21768dbc6439c7a951"),
tmp.where("firstName == 'Maryam'").first().getAs("id"));
Assertions
.assertEquals(DHPUtils.md5("0000-0003-2914-2734"), tmp.where("firstName == 'Michael'").first().getAs("id"));
Assertions
.assertEquals(
DHPUtils.md5("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d92"),
tmp.where("firstName == 'Mikhail'").first().getAs("id"));
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
Dataset<Row> tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/working/publication/author");
@Test
public void testDumpResultAuthorRelations() throws Exception {
Assertions.assertEquals(5, tmp.count());
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/")
.getPath();
Assertions.assertEquals(1,tmp.where("firstName == 'Maryam'").count());
spark
.read()
.text(
getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds")
.getPath())
.write()
.text(workingDir.toString() + "/working/resultIds/");
Assertions.assertEquals(DHPUtils.md5("50|DansKnawCris::0224aae28af558f21768dbc6439c7a951"),tmp.where("firstName == 'Maryam'").first().getAs("id"));
Assertions.assertEquals(DHPUtils.md5("0000-0003-2914-2734"),tmp.where("firstName == 'Michael'").first().getAs("id"));
Assertions.assertEquals(DHPUtils.md5("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d92"),tmp.where("firstName == 'Mikhail'").first().getAs("id"));
SparkDumpResults.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/output",
"-workingPath", workingDir.toString() + "/working",
"-resultType", "publication",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-sourcePath", sourcePath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
}
Dataset<Row> tmp = spark
.read()
.option("header", "true")
.option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/working/publication/result_author");
@Test
public void testDumpResultAuthorRelations() throws Exception {
Assertions.assertEquals(6, tmp.count());
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/")
.getPath();
Assertions.assertEquals(2, tmp.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'").count());
Assertions
.assertEquals(
1, tmp
.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'")
.where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'")
.count());
Assertions
.assertEquals(
1, tmp
.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'")
.where("result_id == '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'")
.count());
spark.read().text(getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds")
.getPath())
.write()
.text(workingDir.toString() + "/working/resultIds/");
}
SparkDumpResults.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/output",
"-workingPath", workingDir.toString() + "/working",
"-resultType", "publication",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-sourcePath", sourcePath
});
@Test
public void testDumpResultPid() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/")
.getPath();
spark
.read()
.text(
getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds")
.getPath())
.write()
.text(workingDir.toString() + "/working/resultIds/");
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
SparkDumpResults.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/output",
"-workingPath", workingDir.toString() + "/working",
"-resultType", "publication",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-sourcePath", sourcePath
});
Dataset<Row> tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/working/publication/result_author");
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
Assertions.assertEquals(6, tmp.count());
Dataset<Row> tmp = spark
.read()
.option("header", "true")
.option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/working/publication/result_pid");
Assertions.assertEquals(2, tmp.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'").count());
Assertions.assertEquals(1, tmp.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'")
.where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'").count());
Assertions.assertEquals(1, tmp.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'")
.where("result_id == '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'").count());
tmp.show(false);
Assertions.assertEquals(4, tmp.count());
Assertions
.assertEquals(2, tmp.where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'").count());
Assertions
.assertEquals(
"10.1023/fakedoi",
tmp
.where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' and type == 'doi'")
.first()
.getAs("pid"));
}
}
@Test
public void testDumpResultPid() throws Exception {
@Test
public void prova() throws DocumentException {
String input = "<community id=\"dh-ch\" label=\"Digital Humanities and Cultural Heritage\">" +
" <description>This community gathers research results, data, scientific publications and projects related to the domain of Digital Humanities. This broad definition includes Humanities, Cultural Heritage, History, Archaeology and related fields.</description>"
+
"</community>";
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/")
.getPath();
final Document doc;
final SAXReader reader = new SAXReader();
spark.read().text(getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds")
.getPath())
.write()
.text(workingDir.toString() + "/working/resultIds/");
SparkDumpResults.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/output",
"-workingPath", workingDir.toString() + "/working",
"-resultType", "publication",
"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
"-sourcePath", sourcePath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
Dataset<Row> tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/working/publication/result_pid");
tmp.show(false);
Assertions.assertEquals(4, tmp.count());
Assertions.assertEquals(2, tmp.where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'").count());
Assertions.assertEquals("10.1023/fakedoi", tmp.where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' and type == 'doi'").first().getAs("pid"));
}
doc = reader.read(new StringReader(input));
Element root = doc.getRootElement();
StringBuilder builder = new StringBuilder();
builder.append(DHPUtils.md5(root.attribute("id").getValue()));
builder.append(Constants.SEP);
builder.append(root.attribute("label").getValue());
builder.append(Constants.SEP);
builder.append(root.attribute("id").getValue());
builder.append(Constants.SEP);
builder.append(((Node) (root.selectNodes("//description").get(0))).getText());
System.out.println(builder.toString());
}
}

View File

@ -0,0 +1,117 @@
package eu.dnetlib.dhp.oa.graph.dump.csv;
import static org.apache.commons.lang3.StringUtils.split;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
/**
* @author miriam.baglioni
* @Date 25/05/23
*/
public class MoveOnSingleDirTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory
.getLogger(MoveOnSingleDirTest.class);
private static HashMap<String, String> map = new HashMap<>();
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(MoveOnSingleDirTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(MoveOnSingleDirTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(MoveOnSingleDirTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void testDMoveSingleDir() throws Exception {
final String workingPath = getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working")
.getPath();
spark
.read()
.text(
getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds")
.getPath())
.write()
.text(workingDir.toString() + "/working/resultIds/");
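// SparkMoveOnSigleDir is expected to merge the per-result-type dumps found under workingPath into a
// single output/result directory, and the author dumps into a single output/author directory
// (see the assertions on output/result and output/author below)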
SparkMoveOnSigleDir.main(new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-outputPath", workingDir.toString() + "/output",
"-workingPath", workingPath
});
Dataset<Row> tmp = spark
.read()
.option("header", "true")
.option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/output/result");
Assertions.assertEquals(21, tmp.count());
Assertions.assertEquals(12, tmp.filter("type == 'dataset'").count());
Assertions.assertEquals(4, tmp.filter("type == 'other'").count());
Assertions.assertEquals(4, tmp.filter("type == 'publication'").count());
Assertions.assertEquals(1, tmp.filter("type == 'software'").count());
Assertions
.assertEquals(
8, spark
.read()
.option("header", "true")
.option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/output/author")
.count());
}
}

View File

@ -6,8 +6,6 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRELCommunityResult;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -26,6 +24,8 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRELCommunityResult;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
* @author miriam.baglioni
@ -88,70 +88,134 @@ public class SelectResultAndDumpRelationTest {
"-sourcePath", sourcePath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
Assertions.assertEquals(2, sc.textFile(workingDir.toString() + "/working/communityResultIds").count());
Assertions
.assertEquals(
1, sc
.textFile(workingDir.toString() + "/working/communityResultIds")
.filter(v -> v.equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))
.count());
Assertions.assertEquals(2,sc.textFile(workingDir.toString() + "/working/communityResultIds").count());
Assertions
.assertEquals(
1, sc
.textFile(workingDir.toString() + "/working/communityResultIds")
.filter(v -> v.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
.count());
Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/communityResultIds")
.filter(v -> v.equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")).count());
Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/communityResultIds")
.filter(v -> v.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).count());
//verify that the association is correct with the communityid and result id
spark.read().option("header", "true").option("delimiter",Constants.SEP).csv(workingDir.toString() + "/output/result_community")
.createOrReplaceTempView("result_community");
// verify that the association is correct with the communityid and result id
spark
.read()
.option("header", "true")
.option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/output/result_community")
.createOrReplaceTempView("result_community");
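// note: community_id values in result_community are the md5 hash of the community short name
// (e.g. DHPUtils.md5("dh-ch")), which is why the checks below hash the short names before comparing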
Assertions.assertEquals(3, spark.sql("SELECT * FROM result_community").count());
Assertions.assertEquals(1, spark.sql("SELECT * " +
"FROM result_community " +
"WHERE community_id = '" + DHPUtils.md5("dh-ch") + "'").count());
Assertions
.assertEquals(
1, spark
.sql(
"SELECT * " +
"FROM result_community " +
"WHERE community_id = '" + DHPUtils.md5("dh-ch") + "'")
.count());
Assertions.assertEquals(1, spark.sql("SELECT * " +
"FROM result_community" +
" WHERE result_id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' " +
"AND community_id = '" + DHPUtils.md5("dh-ch") + "'").count());
Assertions.assertEquals(2, spark.sql("SELECT * " +
"FROM result_community " +
"WHERE community_id = '" + DHPUtils.md5("enermaps") + "'").count());
Assertions.assertEquals(1, spark.sql("SELECT * " +
"FROM result_community " +
"WHERE result_id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9' " +
"AND community_id = '" + DHPUtils.md5("enermaps") + "'").count());
Assertions.assertEquals(1, spark.sql("SELECT * " +
"FROM result_community " +
"WHERE result_id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' " +
"AND community_id = '" + DHPUtils.md5("enermaps") + "'").count());
Assertions
.assertEquals(
1, spark
.sql(
"SELECT * " +
"FROM result_community" +
" WHERE result_id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' " +
"AND community_id = '" + DHPUtils.md5("dh-ch") + "'")
.count());
Assertions
.assertEquals(
2, spark
.sql(
"SELECT * " +
"FROM result_community " +
"WHERE community_id = '" + DHPUtils.md5("enermaps") + "'")
.count());
Assertions
.assertEquals(
1, spark
.sql(
"SELECT * " +
"FROM result_community " +
"WHERE result_id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9' " +
"AND community_id = '" + DHPUtils.md5("enermaps") + "'")
.count());
Assertions
.assertEquals(
1, spark
.sql(
"SELECT * " +
"FROM result_community " +
"WHERE result_id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' " +
"AND community_id = '" + DHPUtils.md5("enermaps") + "'")
.count());
Assertions.assertEquals(3, spark.read().textFile(workingDir.toString() + "/working/resultIds").count());
Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/resultIds")
.filter(v -> v.equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")).count());
Assertions
.assertEquals(
1, sc
.textFile(workingDir.toString() + "/working/resultIds")
.filter(v -> v.equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))
.count());
Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/resultIds")
.filter(v -> v.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).count());
Assertions
.assertEquals(
1, sc
.textFile(workingDir.toString() + "/working/resultIds")
.filter(v -> v.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"))
.count());
Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/resultIds")
.filter(v -> v.equals("50|DansKnawCris::26780065282e607306372abd0d808245")).count());
Assertions
.assertEquals(
1, sc
.textFile(workingDir.toString() + "/working/resultIds")
.filter(v -> v.equals("50|DansKnawCris::26780065282e607306372abd0d808245"))
.count());
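// the working resultIds list holds the two community results plus
// 50|DansKnawCris::26780065282e607306372abd0d808245, presumably included because it is the target of
// a relation originating from a community result (see the relation checks below)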
spark.read().option("header", "true").option("delimiter",Constants.SEP).csv(workingDir.toString() + "/output/relation")
.createOrReplaceTempView("relation");
spark
.read()
.option("header", "true")
.option("delimiter", Constants.SEP)
.csv(workingDir.toString() + "/output/relation")
.createOrReplaceTempView("relation");
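// relation identifiers are the md5 hash of source id + relation name + target id
// (here the "cites" relations), as reproduced in the lookups below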
Assertions.assertEquals(2, spark.sql("SELECT * FROM relation").count());
Assertions.assertEquals(1, spark.sql("SELECT * FROM relation WHERE id = '" +
DHPUtils.md5(("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9cites50|DansKnawCris::26780065282e607306372abd0d808245")) + "'").count());
Assertions
.assertEquals(
1, spark
.sql(
"SELECT * FROM relation WHERE id = '" +
DHPUtils
.md5(
("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9cites50|DansKnawCris::26780065282e607306372abd0d808245"))
+ "'")
.count());
Assertions.assertEquals(1, spark.sql("SELECT * FROM relation WHERE id = '" +
DHPUtils.md5(("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9cites50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")) + "'").count());
Assertions
.assertEquals(
1, spark
.sql(
"SELECT * FROM relation WHERE id = '" +
DHPUtils
.md5(
("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9cites50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"))
+ "'")
.count());
}
}
}

View File

@ -0,0 +1,4 @@
{"fullname":"Giovanni Aloisio","id":"5ac035663df4d9099cf92d0e3f22a964","orcid":""}
{"fullname":"Cosimo Palazzo","id":"9f0d3123b6390dd7b2f3cee66c6bc926","orcid":""}
{"firstname":"L","fullname":"L, Issel-Tarver","id":"bafb7637b5f1c692419e55b13bf719a3","lastname":"Issel-Tarver","orcid":""}
{"firstname":"Voula","fullname":"Giouli, Voula","id":"c80f55a9afb32ffc4bc6bb67b6e0df33","lastname":"Giouli","orcid":""}

View File

@ -0,0 +1,12 @@
{"accessright":"UNKNOWN","country":"","description":"Absidiole NE_face ext","id":"50|doi_dedup___::f126b46ff3cea748ffbda3ae4e9ce816","keywords":"chevet, fenêtre, façade","language":"und","publication_date":"2019-01-01","publisher":"Nakala by Huma-Num","title":"QS83_17_Absidiole NE_face ext.jpg","type":"dataset"}
{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::0676bf8b1f33afc121ac4f28e1c3d8ad","keywords":"kiu38; http://sith.huma-num.fr/karnak/38","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 69534. Karnak, KIU 38 / stèle denceinte de ramsès iii XXe dynastie / Ramses III","type":"dataset"}
{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::0b92f33d78d42f54084145b91500941a","keywords":"kiu2869; http://sith.huma-num.fr/karnak/2869","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 8263. Karnak, KIU 2869 / Cour à portique de Thoutmosis IV, Scene, piliers, pilier 03 est : accolade XVIIIe dynastie / Thoutmosis IV","type":"dataset"}
{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::157349520d61226da5d85e0856bdae3e","keywords":"kiu4635; http://sith.huma-num.fr/karnak/4635","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 171030. Karnak, KIU 4635 / Cour nord du IVe pylône porte sud-est, face nord, montants est XVIIIe dynastie / Thoutmosis III","type":"dataset"}
{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::18b2aa2b1b9a2a11da84bc8e1f662070","keywords":"kiu4225; http://sith.huma-num.fr/karnak/4225, kiu4217; http://sith.huma-num.fr/karnak/4217, kiu4218; http://sith.huma-num.fr/karnak/4218","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 151603. Karnak, KIU 4217 / Temple dOpet, Soubassement, face extérieure est, soubassement, 1er registre sud 10.n (opet 213 gauche) Romains / Auguste","type":"dataset"}
{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::31f713b5670d801de154453ea68ff4e1","keywords":"kiu3479; http://sith.huma-num.fr/karnak/3479","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 198480. Karnak, KIU 3479 / VIe pylône, Scene, mur intérieur est, partie nord 3.s annales (vi) : XVIIIe dynastie / Thoutmosis III","type":"dataset"}
{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::358b383fd0975b292edafd6b1d1fe9a2","keywords":"op179; http://sith.huma-num.fr/karnak/op179, kiu1114; http://sith.huma-num.fr/karnak/1114","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 135670. Karnak, KIU 1114 / Temple de Ptah, Objet, objet(s) découvert(s) porte de grenier XVIIe dynastie / SenakhtenRe","type":"dataset"}
{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::4cc834e3443c27cb7b0100a470c5c7f9","keywords":"kiu7329; http://sith.huma-num.fr/karnak/7329, kiu7330; http://sith.huma-num.fr/karnak/7330, kiu7331; http://sith.huma-num.fr/karnak/7331","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 169666. Karnak, KIU 7330 / Salle hypostyle colonnes, côté sud, colonne 017, fût frise XXe dynastie / Ramses IV","type":"dataset"}
{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::516950eba1c6737cbe26a52401b3fb2c","keywords":"kiu2185; http://sith.huma-num.fr/karnak/2185","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 128938. Karnak, KIU 2185 / « Magasin pur » de Khonsou, Objet porte fragmentaire du « magasin pur » de khonsou Ptolemees / Ptolemee Evergete Ier","type":"dataset"}
{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::71182950600db8e6aff20566f9df0345","keywords":"kiu4212; http://sith.huma-num.fr/karnak/4212","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 151470. Karnak, KIU 4212 / Temple dOpet, Scene, face extérieure est, soubassement, 1er registre sud 04.n (opet 210 gauche) : procession de nils Romains / Auguste","type":"dataset"}
{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::99592f6a6bc8b9b67b0d8f1612e310bb","keywords":"kiu3939; http://sith.huma-num.fr/karnak/3939, kiu3822; http://sith.huma-num.fr/karnak/3822, kiu3823; http://sith.huma-num.fr/karnak/3823, kiu3825; http://sith.huma-num.fr/karnak/3825","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 141190. Karnak, KIU 3939 / Temple dOpet face extérieure sud, soubassement, 1er registre bandeau (opet 266-267) Romains / Auguste","type":"dataset"}
{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::abd19ac4153416d0eb73b8f2e7612d35","keywords":"kiu5592; http://sith.huma-num.fr/karnak/5592, kiu8128; http://sith.huma-num.fr/karnak/8128, kiu8129; http://sith.huma-num.fr/karnak/8129, kiu8130; http://sith.huma-num.fr/karnak/8130","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 167789. Karnak","type":"dataset"}

View File

@ -0,0 +1,19 @@
{"author_id":"54ecb1d939e05ac0542d6af377100e67","result_id":"50|doi_dedup___::f126b46ff3cea748ffbda3ae4e9ce816"}
{"author_id":"06706770e1fb3b89fea4d0a8a60e7809","result_id":"50|r3f5b9831893::0b92f33d78d42f54084145b91500941a"}
{"author_id":"3afe02a6563ca7c30df007d69645f730","result_id":"50|r3f5b9831893::18b2aa2b1b9a2a11da84bc8e1f662070"}
{"author_id":"440464bc227f8371c905779a4641d49a","result_id":"50|r3f5b9831893::31f713b5670d801de154453ea68ff4e1"}
{"author_id":"3d0c4aa051cdc1cc71907a973f616767","result_id":"50|r3f5b9831893::4cc834e3443c27cb7b0100a470c5c7f9"}
{"author_id":"874398e3c71ba2e8cf76de4ba458d5fb","result_id":"50|r3f5b9831893::516950eba1c6737cbe26a52401b3fb2c"}
{"author_id":"fe165c3a039f1cc4301c9dbd7c7f2247","result_id":"50|r3f5b9831893::71182950600db8e6aff20566f9df0345"}
{"author_id":"b3b2b99a02b1bbd8d4b5a1994b8d60fe","result_id":"50|r3f5b9831893::99592f6a6bc8b9b67b0d8f1612e310bb"}
{"author_id":"be12aee5482275608067a3cab9e8beb6","result_id":"50|r3f5b9831893::abd19ac4153416d0eb73b8f2e7612d35"}
{"author_id":"dde164aefcd3aebafec84feedd999170","result_id":"50|r3f5b9831893::b26848c3000fbd7153e2fdeaf3d70bd2"}
{"author_id":"3a55a188e8a23e645752055ff18d4720","result_id":"50|r3f5b9831893::b94d49cfb4ea230b784be1fe24f0edd5"}
{"author_id":"a0bcddc2a41a4cc0dd768eced4dd0939","result_id":"50|r3f5b9831893::ef9f1724cef04a9f62bdf90d9084d70b"}
{"author_id":"51b2a67f20cdfd9628233ebf04158468","result_id":"50|r3f5b9831893::f349ea5bdd91d846e70e6a4a3c71ccd6"}
{"author_id":"dfad2f4b741f4fbac8f504dd0088db06","result_id":"50|r3f5b9831893::f82af1f6dfd2b8644ba3ab799285849f"}
{"author_id":"b52f90003de8e73f2f704ced12b83bba","result_id":"50|r3f5b9831893::fb7cf14ef55474c3b745262fea21d4c0"}
{"author_id":"08e7328f7c44b32e1203374aadbedf0c","result_id":"50|doi_dedup___::c7a29e095e1763e09af2eb0e2ffbb717"}
{"author_id":"c8c6c6273e798cf408f848afd8ca13f8","result_id":"50|r3f5b9831893::0bc48082a3803d837098447a4f8fb28d"}
{"author_id":"16d0306f0af215d9ec8f70660026d585","result_id":"50|r3f5b9831893::1a372b7640db956b13716fc5e7b455b7"}
{"author_id":"c0a97e8f55967dedb4a57125e3174816","result_id":"50|r3f5b9831893::1b8dec9230423314146858112059845d"}

View File

@ -0,0 +1,33 @@
{"id":"58c75fe64b4df0126e0e4fdfafb8be18","pid":"http://hdl.handle.net/11280/86e6ac0d","result_id":"50|doi_dedup___::f126b46ff3cea748ffbda3ae4e9ce816","type":"handle"}
{"id":"45c62956554c7d3e7f9708bce5c9a086","pid":"11280/86e6ac0d","result_id":"50|doi_dedup___::f126b46ff3cea748ffbda3ae4e9ce816","type":"handle"}
{"id":"312a5c89fa6d82ccc66c1b9615d3d364","pid":"10.34847/nkl.7f846pnw","result_id":"50|doi_dedup___::f126b46ff3cea748ffbda3ae4e9ce816","type":"doi"}
{"id":"cb29ee70d77746445ca5ce5f121bc473","pid":"http://hdl.handle.net/11280/747fab4a","result_id":"50|r3f5b9831893::0676bf8b1f33afc121ac4f28e1c3d8ad","type":"handle"}
{"id":"45a465d38aabff009c0fcf41c2f08c67","pid":"11280/747fab4a","result_id":"50|r3f5b9831893::0676bf8b1f33afc121ac4f28e1c3d8ad","type":"handle"}
{"id":"cc956040bd5031ecec943d91e8b764fb","pid":"11280/51909d00","result_id":"50|r3f5b9831893::0b92f33d78d42f54084145b91500941a","type":"handle"}
{"id":"726c5eef33521e505ef9cb48fe75d596","pid":"http://hdl.handle.net/11280/51909d00","result_id":"50|r3f5b9831893::0b92f33d78d42f54084145b91500941a","type":"handle"}
{"id":"32429dfa16fa2847b0286efaf0a0dce8","pid":"11280/fc581aa4","result_id":"50|r3f5b9831893::157349520d61226da5d85e0856bdae3e","type":"handle"}
{"id":"554994db0c44fe13283444e190ac9607","pid":"http://hdl.handle.net/11280/fc581aa4","result_id":"50|r3f5b9831893::157349520d61226da5d85e0856bdae3e","type":"handle"}
{"id":"88a301e2cadf5e691ebb6a5665eb78f4","pid":"http://hdl.handle.net/11280/1cfc2896","result_id":"50|r3f5b9831893::18b2aa2b1b9a2a11da84bc8e1f662070","type":"handle"}
{"id":"2f15200f24a870ff9edb3913e292d61f","pid":"11280/1cfc2896","result_id":"50|r3f5b9831893::18b2aa2b1b9a2a11da84bc8e1f662070","type":"handle"}
{"id":"027c0e2083ab8ea468469a34fe9d46e1","pid":"http://hdl.handle.net/11280/3b2225c5","result_id":"50|r3f5b9831893::31f713b5670d801de154453ea68ff4e1","type":"handle"}
{"id":"8466cbb68b2d1c541b056006b7f27ea4","pid":"11280/3b2225c5","result_id":"50|r3f5b9831893::31f713b5670d801de154453ea68ff4e1","type":"handle"}
{"id":"bac82482f2dba75f8e34802ed7789554","pid":"http://hdl.handle.net/11280/f3911908","result_id":"50|r3f5b9831893::358b383fd0975b292edafd6b1d1fe9a2","type":"handle"}
{"id":"8cd4bb9ef9c8007155a95ee9df90ea69","pid":"11280/f3911908","result_id":"50|r3f5b9831893::358b383fd0975b292edafd6b1d1fe9a2","type":"handle"}
{"id":"ba83be852322c4c86ed6b3ab0610987d","pid":"11280/65056b94","result_id":"50|r3f5b9831893::4cc834e3443c27cb7b0100a470c5c7f9","type":"handle"}
{"id":"93cd2ffff769223cf04034e0db0f6284","pid":"http://hdl.handle.net/11280/65056b94","result_id":"50|r3f5b9831893::4cc834e3443c27cb7b0100a470c5c7f9","type":"handle"}
{"id":"c5dcb6dab6f53a281f96bfbe048858ce","pid":"http://hdl.handle.net/11280/dac5fe22","result_id":"50|r3f5b9831893::516950eba1c6737cbe26a52401b3fb2c","type":"handle"}
{"id":"999076fd410cdb0c1599b7d5e355b94a","pid":"11280/dac5fe22","result_id":"50|r3f5b9831893::516950eba1c6737cbe26a52401b3fb2c","type":"handle"}
{"id":"ef68e036a7e753da17a2794ccf1b8ce5","pid":"http://hdl.handle.net/11280/446e3387","result_id":"50|r3f5b9831893::71182950600db8e6aff20566f9df0345","type":"handle"}
{"id":"5377b0f0143c324176bbee897d9d966c","pid":"11280/446e3387","result_id":"50|r3f5b9831893::71182950600db8e6aff20566f9df0345","type":"handle"}
{"id":"9e588201f52f05fca56efc43583ca615","pid":"http://hdl.handle.net/11280/969ae30a","result_id":"50|r3f5b9831893::99592f6a6bc8b9b67b0d8f1612e310bb","type":"handle"}
{"id":"f64681856cadef587b4c34396e9e6861","pid":"11280/969ae30a","result_id":"50|r3f5b9831893::99592f6a6bc8b9b67b0d8f1612e310bb","type":"handle"}
{"id":"4ad4d6c56ce6e206c42849df92d894f5","pid":"http://hdl.handle.net/11280/dddf5851","result_id":"50|r3f5b9831893::abd19ac4153416d0eb73b8f2e7612d35","type":"handle"}
{"id":"0b3ea2f9c96eb9593fd9b21363b7d9f6","pid":"11280/dddf5851","result_id":"50|r3f5b9831893::abd19ac4153416d0eb73b8f2e7612d35","type":"handle"}
{"id":"45dc28539b305d186f51d5ee9465aee0","pid":"http://hdl.handle.net/11280/3f2679d9","result_id":"50|r3f5b9831893::b26848c3000fbd7153e2fdeaf3d70bd2","type":"handle"}
{"id":"b9c5beb054f3ca72477cb1b07351196a","pid":"11280/3f2679d9","result_id":"50|r3f5b9831893::b26848c3000fbd7153e2fdeaf3d70bd2","type":"handle"}
{"id":"ee0120c72b2f9c1fc1dd3cf47c98ac9d","pid":"http://hdl.handle.net/11280/d957e9f3","result_id":"50|r3f5b9831893::b94d49cfb4ea230b784be1fe24f0edd5","type":"handle"}
{"id":"4770ff66784a0b9470551d46e7a0aaa0","pid":"11280/d957e9f3","result_id":"50|r3f5b9831893::b94d49cfb4ea230b784be1fe24f0edd5","type":"handle"}
{"id":"3cf2316ff497fda37d07757e72307173","pid":"11280/e8d8ed9f","result_id":"50|r3f5b9831893::ef9f1724cef04a9f62bdf90d9084d70b","type":"handle"}
{"id":"5a9092d335d45be6d01f9d6af99c9d86","pid":"http://hdl.handle.net/11280/e8d8ed9f","result_id":"50|r3f5b9831893::ef9f1724cef04a9f62bdf90d9084d70b","type":"handle"}
{"id":"37018c7be9823e3c49aeff0e9ae69054","pid":"http://hdl.handle.net/11280/9ff65944","result_id":"50|r3f5b9831893::f349ea5bdd91d846e70e6a4a3c71ccd6","type":"handle"}
{"id":"c372305e06eacc7855c7de0e3fc6df07","pid":"11280/9ff65944","result_id":"50|r3f5b9831893::f349ea5bdd91d846e70e6a4a3c71ccd6","type":"handle"}

View File

@ -0,0 +1,2 @@
{"firstname":"Taal En Spraaktechnologie","fullname":"LS OZ Taal en spraaktechnologie","id":"60fa4ab9fa107f5281b91c1db2885bf9","lastname":"Ls Oz","orcid":""}
{"fullname":"Nispen, van, Annelies","id":"1279ef1ced7366cc6af25a2079ab4554","orcid":""}

View File

@ -0,0 +1,4 @@
{"accessright":"OPEN","country":"","description":"","id":"50|core_ac_uk__::15d72bdde1addf525170aa61664f8daf","keywords":"","language":"eng","publication_date":"","publisher":"Springer International Publishing","title":"Reengineering and Reinventing both Democracy and the Concept of Life in the Digital Era","type":"other"}
{"accessright":"OPEN","country":"IT","description":"","id":"50|od______3686::b0cb086c9a0222684d48b3e355eba1c8","keywords":"","language":"und","publication_date":"2002-01-01","publisher":"","title":"Progetto dellimpianto eolico di Pescopagano (Potenza), progetto secondo classificato al Concorso nazionale “Paesaggi del Vento”, progetto pubblicato in: E. Zanchini , a cura di, Paesaggi del vento, Meltemi, Roma 2002 , pp.84-89","type":"other"}
{"accessright":"OPEN","country":"NL","description":"This article reports about the on-going work on a new version of the metadata framework Component Metadata Infrastructure (CMDI), central to the CLARIN infrastructure. Version 1.2 introduces a number of important changes based on the experience gathered in the last five years of intensive use of CMDI by the digital humanities community, addressing problems encountered, but also introducing new functionality. Next to the consolidation of the structure of the model and schema sanity, new means for lifecycle management have been introduced aimed at combatting the observed proliferation of components, new mechanism for use of external vocabularies will contribute to more consistent use of controlled values and cues for tools will allow improved presentation of the metadata records to the human users. The feature set has been frozen and approved, and the infrastructure is now entering a transition phase, in which all the tools and data need to be migrated to the new version.","id":"50|narcis______::07cab979c27c9240f7ef5d80d752679b","keywords":"","language":"eng","publication_date":"2015-08-26","publisher":"Linköping University Electronic Press, Linköpings universitet","title":"CMDI 1.2: Improvements in the CLARIN Component Metadata Infrastructure","type":"other"}
{"accessright":"OPEN","country":"NL","description":"This paper describes what the CLARIN infrastructure is and how it can be used, with a focus on the Netherlands part of the CLARIN infrastructure. It aims to explain how a humanities researcher can use the CLARIN infrastructure.","id":"50|narcis______::655f9ef445ffa66a1782f29208cc1569","keywords":"","language":"eng","publication_date":"2014-08-20","publisher":"UiL OTS","title":"The CLARIN infrastructure in the Netherlands: What is it and how can you use it?","type":"other"}

View File

@ -0,0 +1,17 @@
{"author_id":"af07dd90a1f0be8159e52f7f572d1c5c","result_id":"50|narcis______::14afd8c5c46d17af87ceef410ab25e01"}
{"author_id":"9f24c2ed6e1cb057772b641806ae77ec","result_id":"50|narcis______::14afd8c5c46d17af87ceef410ab25e01"}
{"author_id":"9ad1701184de323823fc1a858a868ac2","result_id":"50|narcis______::14afd8c5c46d17af87ceef410ab25e01"}
{"author_id":"de106449e38166d8cf2ac7bb7bb6c5d8","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"}
{"author_id":"8a157b06eaaf9fbca8b67011bc374744","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"}
{"author_id":"10bffdada7578cec278ba1a5e3d63da5","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"}
{"author_id":"d2a8ebfa553c4f6ff90998bd1c58fbcc","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"}
{"author_id":"86b929edfab2d532f075506559a6ac76","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"}
{"author_id":"478c134423c1afa8bb2ee174014726af","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"}
{"author_id":"ba92d49768133c928d102eb86cb3690c","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"}
{"author_id":"d590f7127b93a0b6003cbed3bd20983b","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"}
{"author_id":"c146c73851641e52e6ea1adc6f271fd1","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"}
{"author_id":"e3e6238baf917a025bcbff8be9288393","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"}
{"author_id":"e1a361a13f6595628524b87b6fa29918","result_id":"50|dedup_wf_001::e9457bd83cfd425b8779f239c96e0ffe"}
{"author_id":"5764f46e7ded9260eadea13e81fdf0fe","result_id":"50|dedup_wf_001::e9457bd83cfd425b8779f239c96e0ffe"}
{"author_id":"b56a640d36a2dc9e3dc88401edb61149","result_id":"50|dedup_wf_001::e9457bd83cfd425b8779f239c96e0ffe"}
{"author_id":"e08632d458b519b66e575dd5b7eb54e9","result_id":"50|dedup_wf_001::e9457bd83cfd425b8779f239c96e0ffe"}

View File

@ -0,0 +1,5 @@
{"id":"3ff0ab5e679c5320381c857d8699cd4a","pid":"10.5281/zenodo.2657248","result_id":"50|doi_dedup___::84db353272d83833fa76ec87fc540e63","type":"doi"}
{"id":"935716d050a36d36f797e843187b8192","pid":"https://hdl.handle.net/21.11115/0000-000e-0ff1-2","result_id":"50|r369162d0a40::da892118ba0be7a5cf695ad54ae5147e","type":"handle"}
{"id":"133b9dd1a59099adc577004209e83c52","pid":"21.11115/0000-000e-0ff1-2","result_id":"50|r369162d0a40::da892118ba0be7a5cf695ad54ae5147e","type":"handle"}
{"id":"8e17b86e61db6c34ec741eabe947ea9f","pid":"https://hdl.handle.net/21.11115/0000-000e-ce31-3","result_id":"50|r369162d0a40::b69a5145a8e41bdaa33c24be67c209f1","type":"handle"}
{"id":"b7cc730f4cbb6d379d5c4f57369978b3","pid":"21.11115/0000-000e-ce31-3","result_id":"50|r369162d0a40::b69a5145a8e41bdaa33c24be67c209f1","type":"handle"}

View File

@ -0,0 +1,4 @@
{"fullname":"Giovanni Aloisio","id":"5ac035663df4d9099cf92d0e3f22a964","orcid":""}
{"fullname":"Cosimo Palazzo","id":"9f0d3123b6390dd7b2f3cee66c6bc926","orcid":""}
{"firstname":"L","fullname":"L, Issel-Tarver","id":"bafb7637b5f1c692419e55b13bf719a3","lastname":"Issel-Tarver","orcid":""}
{"firstname":"Voula","fullname":"Giouli, Voula","id":"c80f55a9afb32ffc4bc6bb67b6e0df33","lastname":"Giouli","orcid":""}

View File

@ -0,0 +1,4 @@
{"accessright":"OPEN","country":"","description":"We describe the CoNLL-2002 shared task: language-independent named entity recognition. We give background information on the data sets and the evaluation method, present a general overview of the systems that have taken part in the task and discuss their performance.","id":"50|doi_dedup___::13b14c741a7b3420591c161f54ed5c80","keywords":"computer science - computation and language, i.2.7, computation and language (cs.cl), fos: computer and information sciences","language":"eng","publication_date":"2002-09-05","publisher":"","title":"Introduction to the CoNLL-2002 Shared Task: Language-Independent Named Entity Recognition","type":"publication"}
{"accessright":"OPEN","country":"GB","description":"Following a strategy similar to that used in baker's yeast (Herrgård et al. Nat Biotechnol 26:1155-1160, 2008). A consensus yeast metabolic network obtained from a community approach to systems biology (Herrgård et al. 2008; Dobson et al. BMC Syst Biol 4:145, 2010). Further developments towards a genome-scale metabolic model of yeast (Dobson et al. 2010; Heavner et al. BMC Syst Biol 6:55, 2012). Yeast 5-an expanded reconstruction of the Saccharomyces cerevisiae metabolic network (Heavner et al. 2012) and in Salmonella typhimurium (Thiele et al. BMC Syst Biol 5:8, 2011). A community effort towards a knowledge-base and mathematical model of the human pathogen Salmonellatyphimurium LT2 (Thiele et al. 2011), a recent paper (Thiele et al. Nat Biotechnol 31:419-425, 2013). A community-driven global reconstruction of human metabolism (Thiele et al. 2013) described a much improved 'community consensus' reconstruction of the human metabolic network, called Recon 2, and the authors (that include the present ones) have made it freely available via a database at http://humanmetabolism.org/ and in SBML format at Biomodels (http://identifiers.org/biomodels.db/MODEL1109130000. This short analysis summarises the main findings, and suggests some approaches that will be able to exploit the availability of this model to advantage. © 2013 The Author(s).","id":"50|doi_dedup___::e0392f427fea9a701aa469e6f24bdf93","keywords":"review article, metabolism, modelling, systems biology, networks, metabolic networks, clinical biochemistry, biochemistry, endocrinology, diabetes and metabolism, community approach, operations research, metabolic network, human metabolism, metabolic model, biology, computational biology, sbml, 03 medical and health sciences, 0302 clinical medicine, 0303 health sciences, 030220 oncology & carcinogenesis, 030304 developmental biology, researchinstitutes_networks_beacons/manchester_institute_of_biotechnology, manchester institute of biotechnology","language":"eng","publication_date":"2013-08-01","publisher":"Springer US","title":"An analysis of a community-driven reconstruction of the human metabolic network","type":"publication"}
{"accessright":"OPEN","country":"","description":"Current machine learning systems operate, almost exclusively, in a statistical, or model-free mode, which entails severe theoretical limits on their power and performance. Such systems cannot reason about interventions and retrospection and, therefore, cannot serve as the basis for strong AI. To achieve human level intelligence, learning machines need the guidance of a model of reality, similar to the ones used in causal inference tasks. To demonstrate the essential role of such models, I will present a summary of seven tasks which are beyond reach of current machine learning systems and which have been accomplished using the tools of causal modeling.","id":"50|doi_dedup___::2436e90941a664931b54b956ade5b77b","keywords":"machine learning (cs.lg), artificial intelligence (cs.ai), machine learning (stat.ml), fos: computer and information sciences, mode (statistics), causal inference, artificial intelligence, business.industry, business, power (physics), computer science, machine learning, computer.software_genre, computer, basis (linear algebra), 03 medical and health sciences, 02 engineering and technology, 0202 electrical engineering, electronic engineering, information engineering, 0301 basic medicine, 020201 artificial intelligence & image processing, 030104 developmental biology, computer science - learning, computer science - artificial intelligence, statistics - machine learning","language":"und","publication_date":"2018-02-02","publisher":"arXiv","title":"Theoretical Impediments to Machine Learning With Seven Sparks from the Causal Revolution","type":"publication"}
{"accessright":"OPEN","country":"","description":"In most natural and engineered systems, a set of entities interact with each other in complicated patterns that can encompass multiple types of relationships, change in time, and include other types of complications. Such systems include multiple subsystems and layers of connectivity, and it is important to take such \"multilayer\" features into account to try to improve our understanding of complex systems. Consequently, it is necessary to generalize \"traditional\" network theory by developing (and validating) a framework and associated tools to study multilayer systems in a comprehensive fashion. The origins of such efforts date back several decades and arose in multiple disciplines, and now the study of multilayer networks has become one of the most important directions in network science. In this paper, we discuss the history of multilayer networks (and related concepts) and review the exploding body of work on such networks. To unify the disparate terminology in the large body of recent work, we discuss a general framework for multilayer networks, construct a dictionary of terminology to relate the numerous existing concepts to each other, and provide a thorough discussion that compares, contrasts, and translates between related notions such as multilayer networks, multiplex networks, interdependent networks, networks of networks, and many others. We also survey and discuss existing data sets that can be represented as multilayer networks. We review attempts to generalize single-layer-network diagnostics to multilayer networks. We also discuss the rapidly expanding research on multilayer-network models and notions like community structure, connected components, tensor decompositions, and various types of dynamical processes on multilayer networks. We conclude with a summary and an outlook.","id":"50|doi_dedup___::c5a574592f2e347f27be49d2c20a5558","keywords":"applied mathematics, computational mathematics, control and optimization, management science and operations research, computer networks and communications, data science, connected component, terminology, complex system, network theory, network science, construct (philosophy), computer science, interdependent networks, set (psychology), 01 natural sciences, 0103 physical sciences, 010306 general physics, 010305 fluids & plasmas, physics - physics and society, computer science - social and information networks, physics and society (physics.soc-ph), social and information networks (cs.si), fos: physical sciences, fos: computer and information sciences","language":"und","publication_date":"2013-09-27","publisher":"Oxford University Press (OUP)","title":"Multilayer networks","type":"publication"}

View File

@ -0,0 +1,17 @@
{"author_id":"6fa85e5d3da0c5ed3ab65e4423481714","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"}
{"author_id":"dad3b6e22750b26a27296cd1c98565d1","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"}
{"author_id":"121d8003d3895905cfd67b9b69ac99e1","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"}
{"author_id":"91d3d8c07152d64fbf1c059940211334","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"}
{"author_id":"a25d1cc688c34c0458a4b00b48bc4cdc","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"}
{"author_id":"968ad30220675afb7a0b2b583b35c3a1","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"}
{"author_id":"a55af296962dfb58977aabcb3cf6a8d9","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"}
{"author_id":"5a344a09dab274779fd8e34654fd3541","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"}
{"author_id":"77104c891595df750391d710280da022","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"}
{"author_id":"148f572c63c1f22386c1cae02e5bae2d","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"}
{"author_id":"8e571c27bc66cf96051302db9aa903dc","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"}
{"author_id":"175e45bf98e2b74df9c888598bb917fc","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"}
{"author_id":"bcdeabeece29231977e580b8f417ea82","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"}
{"author_id":"11cea0826b37ff58aa2f4c12ec42695e","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"}
{"author_id":"faf54def0161659b903f58ab4ce8bfae","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"}
{"author_id":"088daddc0f62bc2b8700a4e66a399d5f","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"}
{"author_id":"0b78df096d451535b5b8f7f4f47a6433","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"}

View File

@ -0,0 +1,12 @@
{"id":"94c1431ed983f9ea9996650e2d2205cc","pid":"10.5281/zenodo.3529160","result_id":"50|doi_dedup___::564289a1b69707f216d73aafdd70b20e","type":"doi"}
{"id":"f2328b2e830ee5c03945f65ab1802af7","pid":"10.3389/fphar.2019.01303","result_id":"50|doi_dedup___::564289a1b69707f216d73aafdd70b20e","type":"doi"}
{"id":"53511fa534223420fb925c58051725d6","pid":"31749705","result_id":"50|doi_dedup___::564289a1b69707f216d73aafdd70b20e","type":"pmid"}
{"id":"0e254059fe10cf07df8dbae2cfe5797e","pid":"pmc6848277","result_id":"50|doi_dedup___::564289a1b69707f216d73aafdd70b20e","type":"pmc"}
{"id":"a6181896a32edebf1c45649b894e5886","pid":"10.5281/zenodo.3529159","result_id":"50|doi_dedup___::564289a1b69707f216d73aafdd70b20e","type":"doi"}
{"id":"6e2dc8a4fd3523656a5abd3c0e090a18","pid":"10.7287/peerj.preprints.2711v2","result_id":"50|doi_dedup___::612838ab331dcdfeb9862351bd3fb423","type":"doi"}
{"id":"2072bbca91cb3f3a05b2454edce57f6f","pid":"10.1371/journal.pbio.1002614","result_id":"50|doi_dedup___::612838ab331dcdfeb9862351bd3fb423","type":"doi"}
{"id":"a4e63567711400f9526cc46ca84d2bc1","pid":"pmc5655613","result_id":"50|doi_dedup___::612838ab331dcdfeb9862351bd3fb423","type":"pmc"}
{"id":"477cabc52ec11dfaec8631ee1073376d","pid":"29065148","result_id":"50|doi_dedup___::612838ab331dcdfeb9862351bd3fb423","type":"pmid"}
{"id":"27285b8c2487b534fc2196d27ad4cf0d","pid":"10.7287/peerj.preprints.2711v3","result_id":"50|doi_dedup___::612838ab331dcdfeb9862351bd3fb423","type":"doi"}
{"id":"056a211b8f85fe3058825df170960c06","pid":"10.1111/cgf.13610","result_id":"50|doi_dedup___::32c3649d7aa266f3d754463d6194ebd5","type":"doi"}
{"id":"79c575556941fbb62d9eee77b97fd0e4","pid":"1902.06815","result_id":"50|doi_dedup___::32c3649d7aa266f3d754463d6194ebd5","type":"arxiv"}

View File

@ -0,0 +1,2 @@
{"firstname":"Maurizio","fullname":"Toscano, Maurizio","id":"045bdce3ee24842af4eb4a7f89a44adb","lastname":"Toscano","orcid":""}
{"firstname":"","fullname":"Aitor Díaz","id":"25fc898122164b69f56f08a8545804d3","lastname":"","orcid":""}

View File

@ -0,0 +1 @@
{"accessright":"OPEN","country":"","description":"<p>Mapping digital humanities in Spain (1993-2019)</p> <p>This dataset has been&nbsp;extensively analysed in the following paper&nbsp;<a href=\"https://doi.org/10.3145/epi.2020.nov.01\">https://doi.org/10.3145/epi.2020.nov.01</a>&nbsp;and has also been used for the following poster&nbsp;<a href=\"https://doi.org/10.5281/zenodo.4256689\">https://doi.org/10.5281/zenodo.4256689</a></p>","id":"50|doi_dedup___::57c23b72fc2da4d47b35e5b871c35423","keywords":"","language":"esl/spa","publication_date":"2020-06-14","publisher":"Zenodo","title":"Mapping digital humanities in Spain - 1993-2019","type":"software"}

View File

@ -0,0 +1,2 @@
{"author_id":"045bdce3ee24842af4eb4a7f89a44adb","result_id":"50|doi_dedup___::57c23b72fc2da4d47b35e5b871c35423"}
{"author_id":"25fc898122164b69f56f08a8545804d3","result_id":"50|doi_dedup___::57c23b72fc2da4d47b35e5b871c35423"}

View File

@ -0,0 +1,2 @@
{"id":"cb7d0c2e4660c784cb647060974dbee7","pid":"10.5281/zenodo.3893545","result_id":"50|doi_dedup___::57c23b72fc2da4d47b35e5b871c35423","type":"doi"}
{"id":"19703b43918fc184698f6e0298bf2fc8","pid":"10.5281/zenodo.3893546","result_id":"50|doi_dedup___::57c23b72fc2da4d47b35e5b871c35423","type":"doi"}