From d0f144d422fdccbd689da31ea2a17a98c46ea48c Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 11 May 2023 16:44:54 +0200 Subject: [PATCH 01/19] first implementation for the dump in csv of the subset of the graph related to specific communities. The only relations considered are the cites. the source must be within the set of communties, the target con be outside => we also have to map nodes not related to the communities of interest. These communities are given as parameter --- .../oa/graph/dump/QueryInformationSystem.java | 27 ++ .../dhp/oa/graph/dump/csv/AuthorResult.java | 93 ++++++ .../dhp/oa/graph/dump/csv/Constats.java | 11 + .../oa/graph/dump/csv/DumpCommunities.java | 108 +++++++ .../oa/graph/dump/csv/SparkDumpResults.java | 288 ++++++++++++++++++ .../graph/dump/csv/SparkMoveOnSigleDir.java | 109 +++++++ .../SparkSelectResultsAndDumpRelations.java | 181 +++++++++++ .../oa/graph/dump/csv/model/CSVAuthor.java | 57 ++++ .../oa/graph/dump/csv/model/CSVCitation.java | 37 +++ .../dhp/oa/graph/dump/csv/model/CSVPid.java | 51 ++++ .../dump/csv/model/CSVRELCommunityResult.java | 28 ++ .../oa/graph/dump/csv/model/CSVRelResAut.java | 28 ++ .../oa/graph/dump/csv/model/CSVResult.java | 108 +++++++ .../dump/csv/oozie_app/config-default.xml | 30 ++ .../oa/graph/dump/csv/oozie_app/workflow.xml | 262 ++++++++++++++++ .../oa/graph/dump/xqueries/all_communities.xq | 8 + .../graph/dump/xqueries/set_of_communities.xq | 11 + .../graph/dump/xqueries/single_community.xq | 8 + 18 files changed, 1445 insertions(+) create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constats.java create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunities.java create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkMoveOnSigleDir.java create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/config-default.xml create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/all_communities.xq create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/single_community.xq diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java index b972de6..60f1951 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java @@ -1,12 +1,18 @@ 
package eu.dnetlib.dhp.oa.graph.dump; +import java.io.BufferedWriter; import java.io.StringReader; +import java.util.ArrayList; import java.util.List; +import eu.dnetlib.dhp.oa.graph.dump.csv.Constats; +import eu.dnetlib.dhp.oa.graph.dump.csv.DumpCommunities; +import eu.dnetlib.dhp.utils.DHPUtils; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; +import org.dom4j.Node; import org.dom4j.io.SAXReader; import org.xml.sax.SAXException; @@ -71,4 +77,25 @@ public class QueryInformationSystem { return map; } + public List getCommunityCsv(String toString) throws ISLookUpException, SAXException, DocumentException { + List communities = new ArrayList<>(); + + for (String xml : isLookUp.quickSearchProfile(toString)) { + final Document doc; + final SAXReader reader = new SAXReader(); + reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + doc = reader.read(new StringReader(xml)); + Element root = doc.getRootElement(); + StringBuilder builder = new StringBuilder(); + builder.append(DHPUtils.md5(root.attribute("id").getValue())); + builder.append(Constats.SEP); + builder.append(root.attribute("label").getValue()); + builder.append(Constats.SEP); + builder.append(root.attribute("id").getValue()); + builder.append(Constats.SEP); + builder.append(((Node) (root .selectNodes("/description").get(0))).getText()); + communities.add(builder.toString()); + } + return communities; + } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java new file mode 100644 index 0000000..11a2f0e --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java @@ -0,0 +1,93 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv; + +import eu.dnetlib.dhp.utils.DHPUtils; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 05/05/23 + */ +public class AuthorResult implements Serializable { + private String authorId; + private String firstName; + private String lastName; + private String fullName; + private String orcid; + private String resultId; + private String rank; + + + public String getFullName() { + return fullName; + } + + public void setFullName(String fullName) { + this.fullName = fullName; + } + + public String getAuthorId() { + return authorId; + } + + public void setAuthorId(String authorId) { + this.authorId = authorId; + } + + public String getResultId() { + return resultId; + } + + public void setResultId(String resultId) { + this.resultId = resultId; + } + + public String getRank() { + return rank; + } + + public void setRank(String rank) { + this.rank = rank; + } + + public String getId() { + return authorId; + } + + public void setId(String id) { + this.authorId = id; + } + + public String getFirstName() { + return firstName; + } + + public void setFirstName(String firstName) { + this.firstName = firstName; + } + + public String getLastName() { + return lastName; + } + + public void setLastName(String lastName) { + this.lastName = lastName; + } + + public String getOrcid() { + return orcid; + } + + public void setOrcid(String orcid) { + this.orcid = orcid; + } + + public void autosetId() { + if(orcid != null){ + authorId = DHPUtils.md5(orcid); + }else{ + authorId = DHPUtils.md5(resultId + firstName + lastName + rank); + } + + } +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constats.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constats.java new file mode 100644 index 
0000000..cb9d5b0 --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constats.java @@ -0,0 +1,11 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 10/05/23 + */ +public class Constats implements Serializable { + public final static String SEP = "\t"; +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunities.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunities.java new file mode 100644 index 0000000..94111e6 --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunities.java @@ -0,0 +1,108 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem; +import eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.SAXException; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Serializable; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +/** + * @author miriam.baglioni + * @Date 09/05/23 + */ +public class DumpCommunities implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(DumpCommunities.class); + private final BufferedWriter writer; + + private final transient QueryInformationSystem queryInformationSystem; + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + DumpCommunities.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_step3.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String workingPath = parser.get("workingPath"); + + final String nameNode = parser.get("nameNode"); + log.info("nameNode: {}", nameNode); + + + final DumpCommunities dc = new DumpCommunities(outputPath, nameNode, parser.get("isLookUp")); + + dc.writeCommunity(); + + } + + private void writeCommunity() throws IOException, ISLookUpException, DocumentException, SAXException { + for(String community : queryInformationSystem.getCommunityCsv(IOUtils.toString( + DumpCommunities.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq")))) + { + writer + .write( + community); + writer.write("\n"); + + } + writer.close(); + } + + + public DumpCommunities(String hdfsPath, String hdfsNameNode, String isLookUpUrl) throws Exception { + final Configuration conf = new Configuration(); + queryInformationSystem = new QueryInformationSystem(); +
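// wire the ISLookUp client used to read the community profiles and open a UTF-8 writer on HDFS; + // any file already present at hdfsPath is deleted first, so the dump of the community list is recreated from scratch +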
queryInformationSystem.setIsLookUp(Utils.getIsLookUpService(isLookUpUrl)); + + conf.set("fs.defaultFS", hdfsNameNode); + FileSystem fileSystem = FileSystem.get(conf); + Path hdfsWritePath = new Path(hdfsPath); + + if (fileSystem.exists(hdfsWritePath)) { + fileSystem.delete(hdfsWritePath, true); + } + FSDataOutputStream fos = fileSystem.create(hdfsWritePath); + + writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8)); + + + + } + + + +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java new file mode 100644 index 0000000..4270e1b --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java @@ -0,0 +1,288 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVPid; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.utils.DHPUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.MapGroupsFunction; +import org.apache.spark.sql.*; +import org.apache.spark.sql.Dataset; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +/** + * @author miriam.baglioni + * @Date 04/05/23 + */ +public class SparkDumpResults implements Serializable { + + + + private static final Logger log = LoggerFactory.getLogger(SparkDumpResults.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkDumpResults.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_step2.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String resultType = parser.get("resultType"); + log.info("resultType: {}", resultType); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + final String workingPath = parser.get("workingPath"); + + Class inputClazz = (Class) Class.forName(resultClassName); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Utils.removeOutputDir(spark, outputPath ); + run(spark, inputPath, outputPath, inputClazz, resultType, workingPath); + + }); + + } + + private static void run(SparkSession spark, String 
inputPath, String outputPath, + Class inputClazz, String resultType, String workingPath) { + + Dataset resultIds = spark.read().textFile(workingPath + "/resultIds"); + Dataset results = Utils + .readPath(spark, inputPath + "/" + resultType, inputClazz) + .filter((FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && !p.getDataInfo().getInvisible()); + + // map results + resultIds.joinWith(results, resultIds.col("value").equalTo(results.col("id"))) + .map((MapFunction, CSVResult>) t2 -> mapResultInfo(t2._2()), Encoders.bean(CSVResult.class) ) + .write() + .option("compression","gzip") + .mode(SaveMode.Overwrite) + .csv(workingPath + "/" + resultType + "/result"); + + + // map relations between pid and result + resultIds.joinWith(results, resultIds.col("value").equalTo(results.col("id"))) + .flatMap((FlatMapFunction, CSVPid>) t2 -> + { + List pids = new ArrayList<>(); + if(Optional.ofNullable(t2._2().getPid()).isPresent() && t2._2().getPid().size() > 0){ + pids.addAll(mapPid(t2._2().getPid(), t2._1())); + } + return pids.iterator(); + }, Encoders.bean(CSVPid.class)) + .filter(Objects::nonNull) + .write() + .option("compression","gzip") + .mode(SaveMode.Overwrite) + .csv(workingPath + "/" + resultType + "/result_pid"); + + + //map authors from the result + //for each author in the result: + //if the author has an orcid, the author id is derived from the orcid (i.e. md5(orcid)) + //if the author has no orcid, the id is built as md5(result_id + author_name + author_rank) (when the rank is missing, + //the author's position within the list of authors is used instead), again hashed with md5 + Dataset authorResult = resultIds.joinWith(results, resultIds.col("value").equalTo(results.col("id"))) + .flatMap((FlatMapFunction, AuthorResult>) t2 -> + { + int count = 0; + List arl = new ArrayList<>(); + for (Author a : t2._2().getAuthor()) { + count += 1; + AuthorResult ar = new AuthorResult(); + ar.setResultId(t2._1()); + if (Optional.ofNullable(a.getRank()).isPresent()) { + if (a.getRank() > 0) { + ar.setRank(String.valueOf(a.getRank())); + } else { + ar.setRank(String.valueOf(count)); + } + } + ar.setFirstName(a.getName()); + ar.setLastName(a.getSurname()); + ar.setFullName(a.getFullname()); + ar.setOrcid(getOrcid(a.getPid())); + ar.autosetId(); + arl.add(ar); + } + return arl.iterator(); + } + , Encoders.bean(AuthorResult.class)); + + // map the relation between author and result + authorResult.map((MapFunction) ar -> ar.getResultId() + Constats.SEP + ar.getAuthorId(), Encoders.STRING() ) + .write() + .option("compression","gzip") + .mode(SaveMode.Overwrite) + .csv(workingPath + "/" + resultType + "/result_author"); + + + // map the authors in the working dir.
I do not want to have them repeated + authorResult.groupByKey((MapFunction) ar -> ar.getAuthorId(), Encoders.STRING() ) + .mapGroups((MapGroupsFunction) (k, it) -> getAuthorDump(it.next()) , Encoders.bean(CSVAuthor.class)) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .csv(workingPath + "/" + resultType + "/author"); + + } + + private static List mapPid(List pid, String resultId) { + return pid.stream().map(p -> { + CSVPid ret = new CSVPid(); + + ret.setId(DHPUtils.md5(p.getQualifier().getClassid() + p.getValue())); + ret.setResult_id(resultId); + ret.setPid(p.getValue()); + ret.setType(p.getQualifier().getClassid()); + + return ret; + }).collect(Collectors.toList()); + + } + + private static CSVAuthor getAuthorDump(AuthorResult ar) { + CSVAuthor ret = new CSVAuthor(); + ret.setFirstname(ar.getFirstName()); + + ret.setId(ar.getAuthorId()); + ret.setLastname(ar.getLastName()); + + ret.setFullname(ar.getFullName()); + + if(ar.getOrcid() != null){ + ret.setOrcid(ar.getOrcid()); + }else{ + ret.setOrcid(""); + } + + return ret; + } + + private static String getOrcid(List pid) { + if(!Optional.ofNullable(pid).isPresent()) + return null; + if(pid.size() == 0) + return null; + for(StructuredProperty p : pid){ + if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)){ + return p.getValue(); + } + } + return null; + } + + + + private static CSVResult mapResultInfo(R r) { + CSVResult ret = new CSVResult(); + ret.setId(r.getId()); + ret.setType(r.getResulttype().getClassid()); + ret.setTitle(getTitle(r.getTitle())); + ret.setDescription(getAbstract(r.getDescription())); + ret.setAccessright(r.getBestaccessright().getClassid()); + ret.setPublication_date(r.getDateofacceptance().getValue()); + if (StringUtils.isNotEmpty(r.getPublisher().getValue())) { + ret.setPublisher(r.getPublisher().getValue()); + } else { + ret.setPublisher(""); + } + + StringBuilder sbjs = new StringBuilder(); + for(StructuredProperty sbj : r.getSubject()){ + if(StringUtils.isNotEmpty(sbj.getValue())){ + sbjs.append(sbj.getValue()); + sbjs.append(","); + } + } + ret.setKeywords(sbjs.toString()); + + StringBuilder countries = new StringBuilder(); + + for(Country c: r.getCountry()){ + if(StringUtils.isNotEmpty(c.getClassid())){ + countries.append(c.getClassid()); + } + } + ret.setCountry(countries.toString()); + + if(StringUtils.isNotEmpty(r.getLanguage().getClassid())){ + ret.setLanguage(r.getLanguage().getClassid()); + }else{ + ret.setLanguage(""); + } + + return ret; + } + + private static String getAbstract(List> description) { + for(Field abs:description){ + if(StringUtils.isNotEmpty(abs.getValue())){ + return abs.getValue(); + } + } + return ""; + } + + + private static String getTitle(List titles) { + String firstTitle = null; + for(StructuredProperty title : titles){ + if(StringUtils.isEmpty(firstTitle)){ + if(StringUtils.isNotEmpty(title.getValue())) + firstTitle = title.getValue(); + } + if(title.getQualifier().getClassid().equals(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid())){ + if(StringUtils.isNotEmpty(title.getValue())) + return title.getValue(); + } + } + return ""; + } + + + + +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkMoveOnSigleDir.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkMoveOnSigleDir.java new file mode 100644 index 0000000..48b3a22 --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkMoveOnSigleDir.java @@ -0,0 +1,109 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv; + +import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.utils.DHPUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.MapGroupsFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +/** + * @author miriam.baglioni + * @Date 10/05/23 + */ +public class SparkMoveOnSigleDir implements Serializable { + + //All the products saved in different directories are put under the same one. + // For the authors also a step of reconciliation mast be done, since the same author id can be saved in more that one directory + + private static final Logger log = LoggerFactory.getLogger(SparkMoveOnSigleDir.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkMoveOnSigleDir.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_step2.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Utils.removeOutputDir(spark, outputPath ); + run(spark, outputPath, workingPath); + + }); + + } + + private static void run(SparkSession spark, String outputPath, + String workingPath) { + + spark.read().textFile(workingPath + "/publication/result", workingPath + "/dataset/result", workingPath + "/software/result", workingPath + "/otherresearchproduct/result") + .write() + .mode(SaveMode.Overwrite) + .csv(outputPath + "/result"); + + spark.read().textFile(workingPath + "/publication/result_pid", workingPath + "/dataset/result_pid", workingPath + "/software/result_pid", workingPath + "/otherresearchproduct/result_pid") + .write() + .mode(SaveMode.Overwrite) + .csv(outputPath + "/result_pid"); + + + spark.read().textFile(workingPath + "/publication/result_author", workingPath + "/dataset/result_author", workingPath + "/software/result_author", workingPath + "/otherresearchproduct/result_author") + .write() + .mode(SaveMode.Overwrite) + .csv(outputPath + "/result_author"); + + + spark.read().textFile(workingPath + "/publication/result_author", workingPath + "/dataset/result_author", workingPath + "/software/result_author", workingPath + "/otherresearchproduct/result_author") + 
.groupByKey((MapFunction) a -> a.split("\t")[0], Encoders.STRING()) + .mapGroups((MapGroupsFunction) (k, it) -> it.next(), Encoders.STRING() ) + .write() + .mode(SaveMode.Overwrite) + .csv(outputPath + "/author"); + + + } + + +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java new file mode 100644 index 0000000..c6c554f --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java @@ -0,0 +1,181 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVCitation; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRELCommunityResult; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.utils.DHPUtils; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +/** + * @author miriam.baglioni + * @Date 04/05/23 + */ +public class SparkSelectResultsAndDumpRelations implements Serializable { + + + private static final Logger log = LoggerFactory.getLogger(SparkSelectResultsAndDumpRelations.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkSelectResultsAndDumpRelations.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_step1.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String workingPath = parser.get("workingPath"); + + List communityList = null; + Optional communities = Optional.ofNullable(parser.get("communities")); + if(communities.isPresent()){ + communityList = Arrays.asList(communities.get().split(";")); + } + + SparkConf conf = new SparkConf(); + + List finalCommunityList = communityList; + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Utils.removeOutputDir(spark, outputPath ); + run(spark, inputPath, outputPath, workingPath, finalCommunityList); + + }); + + } + + private static void run(SparkSession spark, String inputPath, String outputPath, String workingPath, + List communityList) { + + //select the result ids related to the set of communities considered + writeCommunityRelatedIds(spark, inputPath + "/publication", Publication.class, communityList, workingPath + 
"/communityResultIds"); + writeCommunityRelatedIds(spark, inputPath + "/dataset", Dataset.class, communityList, workingPath + "/communityResultIds"); + writeCommunityRelatedIds(spark, inputPath + "/software", Software.class, communityList, workingPath + "/communityResultIds" ); + writeCommunityRelatedIds(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList, workingPath + "/communityResultIds"); + + //write the relations result communities + writeCommunityResultRelations(spark, inputPath + "/publication", Publication.class, communityList, workingPath + "/communityResultIds"); + writeCommunityResultRelations(spark, inputPath + "/dataset", Dataset.class, communityList, workingPath + "/communityResultIds"); + writeCommunityResultRelations(spark, inputPath + "/software", Software.class, communityList, workingPath + "/communityResultIds" ); + writeCommunityResultRelations(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList, workingPath + "/communityResultIds"); + + //select the relations with semantics cites + org.apache.spark.sql.Dataset relations = Utils.readPath(spark, inputPath + "/relation", Relation.class) + .filter((FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + r.getRelClass().equals(ModelConstants.CITES)); + + //select the results target of the selected relations having as source one of the results related to the communities + org.apache.spark.sql.Dataset resultIds = spark.read().textFile(outputPath + "/communityResultIds").distinct(); + resultIds.joinWith(relations, resultIds.col("value").equalTo(relations.col("source")), "left") + .flatMap((FlatMapFunction, String>) t2 -> { + if(Optional.ofNullable(t2._2()).isPresent()){ + return Arrays.asList(t2._1(), t2._2().getTarget()).iterator(); + }else{ + return Arrays.asList(t2._1()).iterator(); + } + }, Encoders.STRING()) + .distinct() + .write() + .mode(SaveMode.Overwrite) + .option("compression" , "gzip") + .text(workingPath + "/resultIds"); + + resultIds.joinWith(relations, resultIds.col("value").equalTo(relations.col("source"))) + .map((MapFunction, CSVCitation>) t2 -> mapToCitation(t2._2()), Encoders.bean(CSVCitation.class) ) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .csv(outputPath + "/relation"); + + + } + + private static CSVCitation mapToCitation(Relation relation) { + CSVCitation ret = new CSVCitation(); + ret.setId(DHPUtils.md5(relation.getSource() + relation.getRelClass().toLowerCase() + relation.getTarget())); + ret.setResult_id_cites(relation.getSource()); + ret.setResult_id_cited(relation.getTarget()); + return ret; + } + + private static void writeCommunityResultRelations(SparkSession spark, String inputPath, Class clazz, List communityList, String outputPath) { + Utils + .readPath(spark, inputPath , clazz) + .filter((FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && + !p.getDataInfo().getInvisible() ) + .flatMap((FlatMapFunction) p-> { + List ret = new ArrayList<>(); + for(Context context : p.getContext()){ + String cId = context.getId().contains("::") ? 
context.getId().substring(0, context.getId().indexOf("::")) : context.getId(); + if (communityList.contains(cId)){ + CSVRELCommunityResult crc = new CSVRELCommunityResult(); + crc.setResult_id(p.getId()); + crc.setCommunity_id(DHPUtils.md5(cId)); + } + } + return ret.iterator(); + }, Encoders.bean(CSVRELCommunityResult.class) ) + .write() + .option("compression","gzip") + .mode(SaveMode.Append) + .text(outputPath ); + } + + + private static void writeCommunityRelatedIds(SparkSession spark, String inputPath, Class clazz, List communityList, String outputPath) { + Utils + .readPath(spark, inputPath , clazz) + .filter((FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && + !p.getDataInfo().getInvisible() && + isRelatedToCommunities(p, communityList)) + .map((MapFunction) p-> p.getId(), Encoders.STRING() ) + .write() + .option("compression","gzip") + .mode(SaveMode.Append) + .text(outputPath ); + } + + private static boolean isRelatedToCommunities(R p, List communityList) { + return p.getContext().stream().anyMatch(c -> communityList.contains(c.getId()) || + (c.getId().contains("::") && communityList.contains(c.getId().substring(0, c.getId().indexOf("::"))))); + } + + + +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java new file mode 100644 index 0000000..4311f8a --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java @@ -0,0 +1,57 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv.model; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 11/05/23 + */ +public class CSVAuthor implements Serializable { + private String id; + private String firstname; + private String lastname; + private String fullname; + private String orcid; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public String getFirstname() { + return firstname; + } + + public void setFirstname(String firstname) { + this.firstname = firstname; + } + + public String getLastname() { + return lastname; + } + + public void setLastname(String lastname) { + this.lastname = lastname; + } + + public String getFullname() { + return fullname; + } + + public void setFullname(String fullname) { + this.fullname = fullname; + } + + public String getOrcid() { + return orcid; + } + + public void setOrcid(String orcid) { + this.orcid = orcid; + } + + +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java new file mode 100644 index 0000000..95abab6 --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java @@ -0,0 +1,37 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv.model; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 11/05/23 + */ +public class CSVCitation implements Serializable { + private String id; + private String result_id_cites; + private String result_id_cited; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public String getResult_id_cites() { + return result_id_cites; + } + + public void setResult_id_cites(String result_id_cites) { + this.result_id_cites = result_id_cites; + } + + public String getResult_id_cited() { + return result_id_cited; + } + + public void setResult_id_cited(String result_id_cited) { + this.result_id_cited = result_id_cited; + } +} diff --git 
a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java new file mode 100644 index 0000000..57aedb8 --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java @@ -0,0 +1,51 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv.model; + +import eu.dnetlib.dhp.oa.graph.dump.csv.Constats; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.utils.DHPUtils; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 11/05/23 + */ +public class CSVPid implements Serializable { + + private String id; + private String result_id; + private String pid; + private String type; + + public String getResult_id() { + return result_id; + } + + public void setResult_id(String result_id) { + this.result_id = result_id; + } + + public String getPid() { + return pid; + } + + public void setPid(String pid) { + this.pid = pid; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java new file mode 100644 index 0000000..3922f52 --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java @@ -0,0 +1,28 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv.model; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 11/05/23 + */ +public class CSVRELCommunityResult implements Serializable { + private String result_id; + private String community_id; + + public String getResult_id() { + return result_id; + } + + public void setResult_id(String result_id) { + this.result_id = result_id; + } + + public String getCommunity_id() { + return community_id; + } + + public void setCommunity_id(String community_id) { + this.community_id = community_id; + } +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java new file mode 100644 index 0000000..cbadf29 --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java @@ -0,0 +1,28 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv.model; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 11/05/23 + */ +public class CSVRelResAut implements Serializable { + private String result_id; + private String author_id; + + public String getResult_id() { + return result_id; + } + + public void setResult_id(String result_id) { + this.result_id = result_id; + } + + public String getAuthor_id() { + return author_id; + } + + public void setAuthor_id(String author_id) { + this.author_id = author_id; + } +} diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java new file mode 100644 index 0000000..1877537 --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java @@ -0,0 +1,108 @@ +package eu.dnetlib.dhp.oa.graph.dump.csv.model; + +import com.fasterxml.jackson.annotation.JsonGetter; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.annotation.JsonSetter; +import eu.dnetlib.dhp.schema.oaf.Country; +import 
eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import org.apache.commons.lang.StringUtils; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 11/05/23 + */ +public class CSVResult implements Serializable { + private String id; + private String type; + private String title; + private String description; + private String accessright; + private String publication_date; + private String publisher; + private String keywords; + private String country; + private String language; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } + + public String getAccessright() { + return accessright; + } + + public void setAccessright(String accessright) { + this.accessright = accessright; + } + + public String getPublication_date() { + return publication_date; + } + + public void setPublication_date(String publication_date) { + this.publication_date = publication_date; + } + + public String getPublisher() { + return publisher; + } + + public void setPublisher(String publisher) { + this.publisher = publisher; + } + + public String getKeywords() { + return keywords; + } + + public void setKeywords(String keywords) { + this.keywords = keywords; + } + + public String getCountry() { + return country; + } + + public void setCountry(String country) { + this.country = country; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + +} diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/config-default.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/config-default.xml new file mode 100644 index 0000000..d262cb6 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + hiveDbName + openaire + + + oozie.launcher.mapreduce.user.classpath.first + true + + diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml new file mode 100644 index 0000000..1606d6e --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml @@ -0,0 +1,262 @@ + + + + sourcePath + the source path + + + outputPath + the output path + + + communities + the communities whose products should be dumped + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + 
+ + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + + + + + + + + + yarn + cluster + select results from publication + eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${workingDir} + --communities${communities} + --resultTypepublication + + + + + + + yarn + cluster + select results from dataset + eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${workingDir} + --communities${communities} + --resultTypedataset + + + + + + + yarn + cluster + select results from other + eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${workingDir} + --communities${communities} + --resultTypeotherresearchproduct + + + + + + + yarn + cluster + select results from software + eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir} + --communities${communities} + --resultTypesoftware + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + yarn + cluster + Dump table 
project + eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sourcePath}/project + --resultTableNameeu.dnetlib.dhp.schema.oaf.Project + --outputPath${workingDir}/project + --communityMapPathnoneed + + + + + + + yarn + cluster + Dump table project + eu.dnetlib.dhp.oa.graph.dump.projectssubset.ProjectsSubsetSparkJob + dump-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${workingDir}/project + --outputPath${workingDir}/tar/project + --projectListPath${projectListPath} + + + + + + + eu.dnetlib.dhp.oa.graph.dump.MakeTar + --hdfsPath${outputPath} + --nameNode${nameNode} + --sourcePath${workingDir}/tar + + + + + + + eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS + --hdfsPath${outputPath} + --nameNode${nameNode} + --accessToken${accessToken} + --connectionUrl${connectionUrl} + --metadata${metadata} + --conceptRecordId${conceptRecordId} + --depositionType${depositionType} + --depositionId${depositionId} + + + + + + \ No newline at end of file diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/all_communities.xq b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/all_communities.xq new file mode 100644 index 0000000..620955c --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/all_communities.xq @@ -0,0 +1,8 @@ +for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') +where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] +and ($x//context/param[./@name = 'status']/text() = 'all') +return + +{$x//CONFIGURATION/context/@id} +{$x//CONFIGURATION/context/@label} + \ No newline at end of file diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq new file mode 100644 index 0000000..7fad824 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq @@ -0,0 +1,11 @@ +for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') +where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] +and ($x//CONFIGURATION/context[./@id='dh-ch'] or $x//CONFIGURATION/context[./@id='dariah'] or $x//CONFIGURATION/context[./@id='enermaps'] or $x//CONFIGURATION/context[./@id='beopen']) +return + +{$x//CONFIGURATION/context/@id} +{$x//CONFIGURATION/context/@label} + +{$x//CONFIGURATION/context/param[@name='description']/text()} + + \ No newline at end of file diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/single_community.xq 
b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/single_community.xq new file mode 100644 index 0000000..4f257a6 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/single_community.xq @@ -0,0 +1,8 @@ +for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') +where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] +and $x//CONFIGURATION/context[./@id=%s] +return + +{$x//CONFIGURATION/context/@id} +{$x//CONFIGURATION/context/@label} + \ No newline at end of file From acb3c691bc396d1ffae078b5a6ffa19316b11b92 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 16 May 2023 14:04:44 +0200 Subject: [PATCH 02/19] [DUMP CSV] added query and method to get the information to dump in the CSV regarding the selected communities --- .../dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java index 60f1951..974ef7f 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java @@ -6,9 +6,6 @@ import java.io.StringReader; import java.util.ArrayList; import java.util.List; -import eu.dnetlib.dhp.oa.graph.dump.csv.Constats; -import eu.dnetlib.dhp.oa.graph.dump.csv.DumpCommunities; -import eu.dnetlib.dhp.utils.DHPUtils; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; @@ -17,6 +14,9 @@ import org.dom4j.io.SAXReader; import org.xml.sax.SAXException; import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; +import eu.dnetlib.dhp.oa.graph.dump.csv.Constats; +import eu.dnetlib.dhp.oa.graph.dump.csv.DumpCommunities; +import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -93,7 +93,7 @@ public class QueryInformationSystem { builder.append(Constats.SEP); builder.append(root.attribute("id").getValue()); builder.append(Constats.SEP); - builder.append(((Node) (root .selectNodes("/description").get(0))).getText()); + builder.append(((Node) (root.selectNodes("/description").get(0))).getText()); communities.add(builder.toString()); } return communities; From b9076f9aa8aa136c1af4d27c3e2be026585b2ba0 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 16 May 2023 14:06:25 +0200 Subject: [PATCH 03/19] [DUMP CSV] model classes to mirror the attributes of the tables to be dumped --- .../oa/graph/dump/csv/model/CSVAuthor.java | 72 ++++----- .../oa/graph/dump/csv/model/CSVCitation.java | 43 ++--- .../dhp/oa/graph/dump/csv/model/CSVPid.java | 61 +++---- .../dump/csv/model/CSVRELCommunityResult.java | 29 ++-- .../oa/graph/dump/csv/model/CSVRelResAut.java | 29 ++-- .../oa/graph/dump/csv/model/CSVResult.java | 149 +++++++++--------- .../oa/graph/dump/input_dump_csv_ste2.json | 38 +++++ 7 files changed, 233 insertions(+), 188 deletions(-) create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste2.json diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java index 4311f8a..a3188f4 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java +++ 
b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv.model; import java.io.Serializable; @@ -7,51 +8,50 @@ import java.io.Serializable; * @Date 11/05/23 */ public class CSVAuthor implements Serializable { - private String id; - private String firstname; - private String lastname; - private String fullname; - private String orcid; + private String id; + private String firstname; + private String lastname; + private String fullname; + private String orcid; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getFirstname() { - return firstname; - } + public String getFirstname() { + return firstname; + } - public void setFirstname(String firstname) { - this.firstname = firstname; - } + public void setFirstname(String firstname) { + this.firstname = firstname; + } - public String getLastname() { - return lastname; - } + public String getLastname() { + return lastname; + } - public void setLastname(String lastname) { - this.lastname = lastname; - } + public void setLastname(String lastname) { + this.lastname = lastname; + } - public String getFullname() { - return fullname; - } + public String getFullname() { + return fullname; + } - public void setFullname(String fullname) { - this.fullname = fullname; - } + public void setFullname(String fullname) { + this.fullname = fullname; + } - public String getOrcid() { - return orcid; - } - - public void setOrcid(String orcid) { - this.orcid = orcid; - } + public String getOrcid() { + return orcid; + } + public void setOrcid(String orcid) { + this.orcid = orcid; + } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java index 95abab6..d7b54e3 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv.model; import java.io.Serializable; @@ -7,31 +8,31 @@ import java.io.Serializable; * @Date 11/05/23 */ public class CSVCitation implements Serializable { - private String id; - private String result_id_cites; - private String result_id_cited; + private String id; + private String result_id_cites; + private String result_id_cited; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getResult_id_cites() { - return result_id_cites; - } + public String getResult_id_cites() { + return result_id_cites; + } - public void setResult_id_cites(String result_id_cites) { - this.result_id_cites = result_id_cites; - } + public void setResult_id_cites(String result_id_cites) { + this.result_id_cites = result_id_cites; + } - public String getResult_id_cited() { - return result_id_cited; - } + public String getResult_id_cited() { + return result_id_cited; + } - public void setResult_id_cited(String result_id_cited) { - this.result_id_cited = result_id_cited; - } + public void setResult_id_cited(String result_id_cited) { + this.result_id_cited = result_id_cited; + } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java 
b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java index 57aedb8..d067d08 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java @@ -1,51 +1,52 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv.model; +import java.io.Serializable; + import eu.dnetlib.dhp.oa.graph.dump.csv.Constats; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.DHPUtils; -import java.io.Serializable; - /** * @author miriam.baglioni * @Date 11/05/23 */ public class CSVPid implements Serializable { - private String id; - private String result_id; - private String pid; - private String type; + private String id; + private String result_id; + private String pid; + private String type; - public String getResult_id() { - return result_id; - } + public String getResult_id() { + return result_id; + } - public void setResult_id(String result_id) { - this.result_id = result_id; - } + public void setResult_id(String result_id) { + this.result_id = result_id; + } - public String getPid() { - return pid; - } + public String getPid() { + return pid; + } - public void setPid(String pid) { - this.pid = pid; - } + public void setPid(String pid) { + this.pid = pid; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(String type) { - this.type = type; - } + public void setType(String type) { + this.type = type; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java index 3922f52..bf81fce 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv.model; import java.io.Serializable; @@ -7,22 +8,22 @@ import java.io.Serializable; * @Date 11/05/23 */ public class CSVRELCommunityResult implements Serializable { - private String result_id; - private String community_id; + private String result_id; + private String community_id; - public String getResult_id() { - return result_id; - } + public String getResult_id() { + return result_id; + } - public void setResult_id(String result_id) { - this.result_id = result_id; - } + public void setResult_id(String result_id) { + this.result_id = result_id; + } - public String getCommunity_id() { - return community_id; - } + public String getCommunity_id() { + return community_id; + } - public void setCommunity_id(String community_id) { - this.community_id = community_id; - } + public void setCommunity_id(String community_id) { + this.community_id = community_id; + } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java index cbadf29..610668e 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv.model; import java.io.Serializable; @@ -7,22 +8,22 @@ import java.io.Serializable; * @Date 11/05/23 */ 
public class CSVRelResAut implements Serializable { - private String result_id; - private String author_id; + private String result_id; + private String author_id; - public String getResult_id() { - return result_id; - } + public String getResult_id() { + return result_id; + } - public void setResult_id(String result_id) { - this.result_id = result_id; - } + public void setResult_id(String result_id) { + this.result_id = result_id; + } - public String getAuthor_id() { - return author_id; - } + public String getAuthor_id() { + return author_id; + } - public void setAuthor_id(String author_id) { - this.author_id = author_id; - } + public void setAuthor_id(String author_id) { + this.author_id = author_id; + } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java index 1877537..1baee9b 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java @@ -1,108 +1,111 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv.model; +import java.io.Serializable; + +import org.apache.commons.lang.StringUtils; + import com.fasterxml.jackson.annotation.JsonGetter; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonSetter; + import eu.dnetlib.dhp.schema.oaf.Country; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import org.apache.commons.lang.StringUtils; - -import java.io.Serializable; /** * @author miriam.baglioni * @Date 11/05/23 */ public class CSVResult implements Serializable { - private String id; - private String type; - private String title; - private String description; - private String accessright; - private String publication_date; - private String publisher; - private String keywords; - private String country; - private String language; + private String id; + private String type; + private String title; + private String description; + private String accessright; + private String publication_date; + private String publisher; + private String keywords; + private String country; + private String language; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(String type) { - this.type = type; - } + public void setType(String type) { + this.type = type; + } - public String getTitle() { - return title; - } + public String getTitle() { + return title; + } - public void setTitle(String title) { - this.title = title; - } + public void setTitle(String title) { + this.title = title; + } - public String getDescription() { - return description; - } + public String getDescription() { + return description; + } - public void setDescription(String description) { - this.description = description; - } + public void setDescription(String description) { + this.description = description; + } - public String getAccessright() { - return accessright; - } + public String getAccessright() { + return accessright; + } - public void setAccessright(String accessright) { - this.accessright = accessright; - } + public void setAccessright(String accessright) { + this.accessright = accessright; + } - public String getPublication_date() { - return publication_date; - } + public String getPublication_date() { + return 
publication_date; + } - public void setPublication_date(String publication_date) { - this.publication_date = publication_date; - } + public void setPublication_date(String publication_date) { + this.publication_date = publication_date; + } - public String getPublisher() { - return publisher; - } + public String getPublisher() { + return publisher; + } - public void setPublisher(String publisher) { - this.publisher = publisher; - } + public void setPublisher(String publisher) { + this.publisher = publisher; + } - public String getKeywords() { - return keywords; - } + public String getKeywords() { + return keywords; + } - public void setKeywords(String keywords) { - this.keywords = keywords; - } + public void setKeywords(String keywords) { + this.keywords = keywords; + } - public String getCountry() { - return country; - } + public String getCountry() { + return country; + } - public void setCountry(String country) { - this.country = country; - } + public void setCountry(String country) { + this.country = country; + } - public String getLanguage() { - return language; - } + public String getLanguage() { + return language; + } - public void setLanguage(String language) { - this.language = language; - } + public void setLanguage(String language) { + this.language = language; + } } diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste2.json b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste2.json new file mode 100644 index 0000000..a78b1be --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste2.json @@ -0,0 +1,38 @@ +[ + + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequential file to read", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName":"wp", + "paramLongName":"workingPath", + "paramDescription": "the path used to store the intermediate results of the dump", + "paramRequired": true + }, + { + "paramName":"c", + "paramLongName":"communities", + "paramDescription": "the semicolon-separated list of communities for which the dump is produced", + "paramRequired": true + } +] + + + + + From 636945a5c559c39aad4a4c05b8454d1dea195616 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 16 May 2023 14:09:21 +0200 Subject: [PATCH 04/19] [DUMP CSV] refactoring --- .../dhp/oa/graph/dump/QueryInformationSystem.java | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java index 974ef7f..29ccab4 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java @@ -1,7 +1,6 @@ package eu.dnetlib.dhp.oa.graph.dump; -import java.io.BufferedWriter; import java.io.StringReader; import java.util.ArrayList; import java.util.List; @@ -14,8 +13,7 @@ import org.dom4j.io.SAXReader; import org.xml.sax.SAXException; import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; -import eu.dnetlib.dhp.oa.graph.dump.csv.Constats; -import eu.dnetlib.dhp.oa.graph.dump.csv.DumpCommunities; +import
eu.dnetlib.dhp.oa.graph.dump.csv.Constants; import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -88,11 +86,11 @@ public class QueryInformationSystem { Element root = doc.getRootElement(); StringBuilder builder = new StringBuilder(); builder.append(DHPUtils.md5(root.attribute("id").getValue())); - builder.append(Constats.SEP); + builder.append(Constants.SEP); builder.append(root.attribute("label").getValue()); - builder.append(Constats.SEP); + builder.append(Constants.SEP); builder.append(root.attribute("id").getValue()); - builder.append(Constats.SEP); + builder.append(Constants.SEP); builder.append(((Node) (root.selectNodes("/description").get(0))).getText()); communities.add(builder.toString()); } From 44a256fc90f24bc33a840259abff6c6d4b20461b Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 16 May 2023 14:10:14 +0200 Subject: [PATCH 05/19] [DUMP CSV] refactoring --- .../java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java index d067d08..873e8f7 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java @@ -3,10 +3,6 @@ package eu.dnetlib.dhp.oa.graph.dump.csv.model; import java.io.Serializable; -import eu.dnetlib.dhp.oa.graph.dump.csv.Constats; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import eu.dnetlib.dhp.utils.DHPUtils; - /** * @author miriam.baglioni * @Date 11/05/23 */ From 2ed76d46622e78a21c83fade7dfee0a6b1dc2544 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 16 May 2023 14:20:45 +0200 Subject: [PATCH 06/19] [DUMP CSV] tested file to execute the dump of the relations with semantics Cites from nodes belonging to the selected communities. It also dumps the relationships result_communities and prepares the ground for the dump of the results.
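A minimal invocation sketch of this step follows; the class name and parameter names are taken from this patch (see input_dump_csv_ste2.json), while the wrapper class, the paths and the community identifiers are placeholders used only for illustration:

import eu.dnetlib.dhp.oa.graph.dump.csv.SparkSelectResultsAndDumpRelations;

public class RunDumpStep2 {
	public static void main(String[] args) throws Exception {
		// hypothetical local run: paths and community ids below are placeholders
		SparkSelectResultsAndDumpRelations.main(new String[] {
			"-isSparkSessionManaged", Boolean.FALSE.toString(),
			"-sourcePath", "/path/to/graph", // root folder holding publication, dataset, software, otherresearchproduct, relation
			"-outputPath", "/path/to/output", // receives the relation and result_community CSV tables
			"-workingPath", "/path/to/working", // receives the intermediate communityResultIds and resultIds
			"-communities", "enermaps;dh-ch" // semicolon-separated list of community identifiers
		});
	}
}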
--- .../SparkSelectResultsAndDumpRelations.java | 318 ++++++++++-------- 1 file changed, 180 insertions(+), 138 deletions(-) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java index c6c554f..094264e 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java @@ -1,12 +1,15 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.graph.dump.Utils; -import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVCitation; -import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRELCommunityResult; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.utils.DHPUtils; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.FilterFunction; @@ -17,165 +20,204 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVCitation; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRELCommunityResult; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - /** * @author miriam.baglioni * @Date 04/05/23 */ +//STEP 2 public class SparkSelectResultsAndDumpRelations implements Serializable { + private static final Logger log = LoggerFactory.getLogger(SparkSelectResultsAndDumpRelations.class); + private static String RESULT_COMMUNITY_TABLE = "/result_community"; + private static String COMMUNITY_RESULT_IDS = "/communityResultIds"; + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkSelectResultsAndDumpRelations.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste2.json")); - private static final Logger log = LoggerFactory.getLogger(SparkSelectResultsAndDumpRelations.class); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - SparkSelectResultsAndDumpRelations.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_step1.json")); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + final 
String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - final String inputPath = parser.get("isLoo"); - log.info("inputPath: {}", inputPath); + final String workingPath = parser.get("workingPath"); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + List communityList = null; + Optional communities = Optional.ofNullable(parser.get("communities")); + if (communities.isPresent()) { + communityList = Arrays.asList(communities.get().split(";")); + } - final String workingPath = parser.get("workingPath"); + SparkConf conf = new SparkConf(); - List communityList = null; - Optional communities = Optional.ofNullable(parser.get("communities")); - if(communities.isPresent()){ - communityList = Arrays.asList(communities.get().split(";")); - } + List finalCommunityList = communityList; + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Utils.removeOutputDir(spark, outputPath); + run(spark, inputPath, outputPath, workingPath, finalCommunityList); - SparkConf conf = new SparkConf(); + }); - List finalCommunityList = communityList; - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - Utils.removeOutputDir(spark, outputPath ); - run(spark, inputPath, outputPath, workingPath, finalCommunityList); + } - }); + private static void run(SparkSession spark, String inputPath, String outputPath, + String workingPath, + List communityList) { - } - - private static void run(SparkSession spark, String inputPath, String outputPath, String workingPath, - List communityList) { - - //select the result ids related to the set of communities considered - writeCommunityRelatedIds(spark, inputPath + "/publication", Publication.class, communityList, workingPath + "/communityResultIds"); - writeCommunityRelatedIds(spark, inputPath + "/dataset", Dataset.class, communityList, workingPath + "/communityResultIds"); - writeCommunityRelatedIds(spark, inputPath + "/software", Software.class, communityList, workingPath + "/communityResultIds" ); - writeCommunityRelatedIds(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList, workingPath + "/communityResultIds"); - - //write the relations result communities - writeCommunityResultRelations(spark, inputPath + "/publication", Publication.class, communityList, workingPath + "/communityResultIds"); - writeCommunityResultRelations(spark, inputPath + "/dataset", Dataset.class, communityList, workingPath + "/communityResultIds"); - writeCommunityResultRelations(spark, inputPath + "/software", Software.class, communityList, workingPath + "/communityResultIds" ); - writeCommunityResultRelations(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList, workingPath + "/communityResultIds"); - - //select the relations with semantics cites - org.apache.spark.sql.Dataset relations = Utils.readPath(spark, inputPath + "/relation", Relation.class) - .filter((FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - r.getRelClass().equals(ModelConstants.CITES)); - - //select the results target of the selected relations having as source one of the results related to the communities - org.apache.spark.sql.Dataset resultIds = 
spark.read().textFile(outputPath + "/communityResultIds").distinct(); - resultIds.joinWith(relations, resultIds.col("value").equalTo(relations.col("source")), "left") - .flatMap((FlatMapFunction, String>) t2 -> { - if(Optional.ofNullable(t2._2()).isPresent()){ - return Arrays.asList(t2._1(), t2._2().getTarget()).iterator(); - }else{ - return Arrays.asList(t2._1()).iterator(); - } - }, Encoders.STRING()) - .distinct() - .write() - .mode(SaveMode.Overwrite) - .option("compression" , "gzip") - .text(workingPath + "/resultIds"); - - resultIds.joinWith(relations, resultIds.col("value").equalTo(relations.col("source"))) - .map((MapFunction, CSVCitation>) t2 -> mapToCitation(t2._2()), Encoders.bean(CSVCitation.class) ) - .write() - .option("compression", "gzip") - .mode(SaveMode.Overwrite) - .csv(outputPath + "/relation"); + // select the result ids related to the set of communities considered + writeCommunityRelatedIds( + spark, inputPath + "/publication", Publication.class, communityList, workingPath + COMMUNITY_RESULT_IDS); + writeCommunityRelatedIds( + spark, inputPath + "/dataset", Dataset.class, communityList, workingPath + COMMUNITY_RESULT_IDS); + writeCommunityRelatedIds( + spark, inputPath + "/software", Software.class, communityList, workingPath + COMMUNITY_RESULT_IDS); + writeCommunityRelatedIds( + spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList, + workingPath + COMMUNITY_RESULT_IDS); - } + // write the relations result communities + writeCommunityResultRelations( + spark, inputPath + "/publication", Publication.class, communityList, outputPath + RESULT_COMMUNITY_TABLE); + writeCommunityResultRelations( + spark, inputPath + "/dataset", Dataset.class, communityList, outputPath + RESULT_COMMUNITY_TABLE); + writeCommunityResultRelations( + spark, inputPath + "/software", Software.class, communityList, outputPath + RESULT_COMMUNITY_TABLE); + writeCommunityResultRelations( + spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList, + outputPath + RESULT_COMMUNITY_TABLE); - private static CSVCitation mapToCitation(Relation relation) { - CSVCitation ret = new CSVCitation(); - ret.setId(DHPUtils.md5(relation.getSource() + relation.getRelClass().toLowerCase() + relation.getTarget())); - ret.setResult_id_cites(relation.getSource()); - ret.setResult_id_cited(relation.getTarget()); - return ret; - } + // select the relations with semantics cites + org.apache.spark.sql.Dataset relations = Utils + .readPath(spark, inputPath + "/relation", Relation.class) + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + r.getRelClass().equals(ModelConstants.CITES)); - private static void writeCommunityResultRelations(SparkSession spark, String inputPath, Class clazz, List communityList, String outputPath) { - Utils - .readPath(spark, inputPath , clazz) - .filter((FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && - !p.getDataInfo().getInvisible() ) - .flatMap((FlatMapFunction) p-> { - List ret = new ArrayList<>(); - for(Context context : p.getContext()){ - String cId = context.getId().contains("::") ? 
context.getId().substring(0, context.getId().indexOf("::")) : context.getId(); - if (communityList.contains(cId)){ - CSVRELCommunityResult crc = new CSVRELCommunityResult(); - crc.setResult_id(p.getId()); - crc.setCommunity_id(DHPUtils.md5(cId)); - } - } - return ret.iterator(); - }, Encoders.bean(CSVRELCommunityResult.class) ) - .write() - .option("compression","gzip") - .mode(SaveMode.Append) - .text(outputPath ); - } + // select the results target of the selected relations having as source one of the results related to the + // communities + org.apache.spark.sql.Dataset resultIds = spark + .read() + .textFile(workingPath + COMMUNITY_RESULT_IDS) + .distinct(); + resultIds + .joinWith(relations, resultIds.col("value").equalTo(relations.col("source")), "left") + .flatMap((FlatMapFunction, String>) t2 -> { + if (Optional.ofNullable(t2._2()).isPresent()) { + return Arrays.asList(t2._1(), t2._2().getTarget()).iterator(); + } else { + return Arrays.asList(t2._1()).iterator(); + } + }, Encoders.STRING()) + .distinct() + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") - private static void writeCommunityRelatedIds(SparkSession spark, String inputPath, Class clazz, List communityList, String outputPath) { - Utils - .readPath(spark, inputPath , clazz) - .filter((FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && - !p.getDataInfo().getInvisible() && - isRelatedToCommunities(p, communityList)) - .map((MapFunction) p-> p.getId(), Encoders.STRING() ) - .write() - .option("compression","gzip") - .mode(SaveMode.Append) - .text(outputPath ); - } + .text(workingPath + "/resultIds"); - private static boolean isRelatedToCommunities(R p, List communityList) { - return p.getContext().stream().anyMatch(c -> communityList.contains(c.getId()) || - (c.getId().contains("::") && communityList.contains(c.getId().substring(0, c.getId().indexOf("::"))))); - } + resultIds + .joinWith(relations, resultIds.col("value").equalTo(relations.col("source"))) + .map( + (MapFunction, CSVCitation>) t2 -> mapToCitation(t2._2()), + Encoders.bean(CSVCitation.class)) + .write() + .option("compression", "gzip") + .option("header","true") + .option("delimiter", Constants.SEP) + .mode(SaveMode.Overwrite) + .csv(outputPath + "/relation"); + } + private static CSVCitation mapToCitation(Relation relation) { + CSVCitation ret = new CSVCitation(); + ret.setId(DHPUtils.md5(relation.getSource() + relation.getRelClass().toLowerCase() + relation.getTarget())); + ret.setResult_id_cites(relation.getSource()); + ret.setResult_id_cited(relation.getTarget()); + return ret; + } + + private static void writeCommunityResultRelations(SparkSession spark, String inputPath, + Class clazz, List communityList, String outputPath) { + Utils + .readPath(spark, inputPath, clazz) + .filter( + (FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && + !p.getDataInfo().getInvisible()) + .flatMap((FlatMapFunction) p -> { + List ret = new ArrayList<>(); + + for (String context :p.getContext().stream().map(Context::getId).distinct().collect(Collectors.toList())) { + String cId = context.contains("::") + ? 
context.substring(0, context.indexOf("::")) + : context; + if (communityList.contains(cId)) { + CSVRELCommunityResult crc = new CSVRELCommunityResult(); + crc.setResult_id(p.getId()); + crc.setCommunity_id(DHPUtils.md5(cId)); + ret.add(crc); + } + } + return ret.iterator(); + }, Encoders.bean(CSVRELCommunityResult.class)) + .write() + .option("compression", "gzip") + .mode(SaveMode.Append) + .option("header","true") + .option("delimiter",Constants.SEP) + .csv(outputPath); + } + + private static void writeCommunityRelatedIds(SparkSession spark, String inputPath, + Class clazz, List communityList, String outputPath) { + Utils + .readPath(spark, inputPath, clazz) + .filter( + (FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && + !p.getDataInfo().getInvisible() && + isRelatedToCommunities(p, communityList)) + .map((MapFunction) Result::getId, Encoders.STRING()) + .write() + .option("compression", "gzip") + .mode(SaveMode.Append) + .text(outputPath); + + } + + private static boolean isRelatedToCommunities(R p, List communityList) { + return p + .getContext() + .stream() + .anyMatch( + c -> communityList.contains(c.getId()) || + (c.getId().contains("::") + && communityList.contains(c.getId().substring(0, c.getId().indexOf("::"))))); + } } From f79c06209ef9ed53808ee7f671f4198448427726 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 16 May 2023 14:21:39 +0200 Subject: [PATCH 07/19] [DUMP CSV] test and resources for the SelectResultAndDumpRelation job --- .../csv/SelectResultAndDumpRelationTest.java | 160 ++++++++++++++++++ .../dhp/oa/graph/dump/csv/input/dataset | 0 .../graph/dump/csv/input/otherresearchproduct | 0 .../dhp/oa/graph/dump/csv/input/publication | 4 + .../dhp/oa/graph/dump/csv/input/relation | 4 + .../dhp/oa/graph/dump/csv/input/software | 0 6 files changed, 168 insertions(+) create mode 100644 dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/dataset create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/otherresearchproduct create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/relation create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/software diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java new file mode 100644 index 0000000..1891a67 --- /dev/null +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java @@ -0,0 +1,160 @@ + +package eu.dnetlib.dhp.oa.graph.dump.csv; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashMap; + +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRELCommunityResult; +import eu.dnetlib.dhp.utils.DHPUtils; +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.ForeachFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import 
org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.oa.graph.dump.complete.SelectRelationTest; +import eu.dnetlib.dhp.oa.graph.dump.complete.SparkSelectValidRelationsJob; +import eu.dnetlib.dhp.schema.oaf.Relation; + +/** + * @author miriam.baglioni + * @Date 11/05/23 + */ +public class SelectResultAndDumpRelationTest { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + private static final Logger log = LoggerFactory + .getLogger(SelectResultAndDumpRelationTest.class); + + private static HashMap map = new HashMap<>(); + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files + .createTempDirectory(SelectResultAndDumpRelationTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(SelectResultAndDumpRelationTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(SelectResultAndDumpRelationTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + public void test1() throws Exception { + + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") + .getPath(); + + SparkSelectResultsAndDumpRelations.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-outputPath", workingDir.toString() + "/output", + "-workingPath", workingDir.toString() + "/working", + "-communities", "enermaps;dh-ch", + "-sourcePath", sourcePath + }); + + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + + Assertions.assertEquals(2,sc.textFile(workingDir.toString() + "/working/communityResultIds").count()); + + Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/communityResultIds") + .filter(v -> v.equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")).count()); + + Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/communityResultIds") + .filter(v -> v.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).count()); + + + //verify that the association is correct with the communityid and result id + spark.read().option("header", "true").option("delimiter",Constants.SEP).csv(workingDir.toString() + "/output/result_community") + .createOrReplaceTempView("result_community"); + + Assertions.assertEquals(3, spark.sql("SELECT * FROM result_community").count()); + + Assertions.assertEquals(1, spark.sql("SELECT * " + + "FROM result_community " + + "WHERE community_id = '" + DHPUtils.md5("dh-ch") + "'").count()); + + Assertions.assertEquals(1, spark.sql("SELECT * " + + "FROM result_community" + + " WHERE result_id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' " + + "AND community_id = '" + DHPUtils.md5("dh-ch") + "'").count()); + + Assertions.assertEquals(2, spark.sql("SELECT * " + + "FROM result_community " + + "WHERE 
community_id = '" + DHPUtils.md5("enermaps") + "'").count()); + Assertions.assertEquals(1, spark.sql("SELECT * " + + "FROM result_community " + + "WHERE result_id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9' " + + "AND community_id = '" + DHPUtils.md5("enermaps") + "'").count()); + Assertions.assertEquals(1, spark.sql("SELECT * " + + "FROM result_community " + + "WHERE result_id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' " + + "AND community_id = '" + DHPUtils.md5("enermaps") + "'").count()); + + + Assertions.assertEquals(3, spark.read().textFile(workingDir.toString() + "/working/resultIds").count()); + + Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/resultIds") + .filter(v -> v.equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")).count()); + + Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/resultIds") + .filter(v -> v.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).count()); + + Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/resultIds") + .filter(v -> v.equals("50|DansKnawCris::26780065282e607306372abd0d808245")).count()); + + spark.read().option("header", "true").option("delimiter",Constants.SEP).csv(workingDir.toString() + "/output/relation") + .createOrReplaceTempView("relation"); + + Assertions.assertEquals(2, spark.sql("SELECT * FROM relation").count()); + + Assertions.assertEquals(1, spark.sql("SELECT * FROM relation WHERE id = '" + + DHPUtils.md5(("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9cites50|DansKnawCris::26780065282e607306372abd0d808245")) + "'").count()); + + Assertions.assertEquals(1, spark.sql("SELECT * FROM relation WHERE id = '" + + DHPUtils.md5(("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9cites50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")) + "'").count()); + + + } + +} diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/dataset b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/dataset new file mode 100644 index 0000000..e69de29 diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/otherresearchproduct b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/otherresearchproduct new file mode 100644 index 0000000..e69de29 diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication new file mode 100644 index 0000000..4983021 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication @@ -0,0 +1,4 @@ +{"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}, {"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - 
Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"ni"},{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"dh-ch"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"dateofcollection":"","dateoftransformation":"2020-05-25T16:14:18.452Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Lit.opg., bijl."}],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::0224aae28af558f21768dbc6439c7a95","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"nl","classname":"nl","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282676557,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T07:51:24Z","harvestDate":"2020-05-25T11:33:13.427Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550013110","metadataNamespace":""}},"originalId":["DansKnawCris::0224aae28af558f21768dbc6439c7a95"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550013110"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550013110"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"prospectie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimpor
t:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Inventariserend veldonderzoek d.m.v. boringen (karterende fase) : Raadhuisstraat te Dirkshorn, gemeente Harenkarspel"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Synthegra Archeologie Rapportenreeks P0502381"}],"journal":null} +{"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:03:57.761Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591283087415,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T07:58:39Z","harvestDate":"2020-05-25T11:34:38.707Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce-kb:document:800020324","metadataNamespace":""}},"originalId":["DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceFullCatalogue&search=priref=800020324"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceFullCatalogue&search=priref=800020324"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datas
etarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} +{"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"ni"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:13:23.976Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::26780065282e607306372abd0d808245","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282897527,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T08:42:33Z","harvestDate":"2020-05-25T11:40:10.845Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550053196","metadataNamespace":""}},"originalId":["DansKnawCris::26780065282e607306372abd0d808245"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive",
"classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} +{"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:13:23.976Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::26780065282e607306372abd0d80fake","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282897527,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T08:42:33Z","harvestDate":"2020-05-25T11:40:10.845Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550053196","metadataNamespace":""}},"originalId":["DansKnawCris::26780065282e607306372abd0d808245"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive",
"classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/relation b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/relation new file mode 100644 index 0000000..1a9a370 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/relation @@ -0,0 +1,4 @@ +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"Cites","relType":"datasourceOrganization","source":"50|DansKnawCris::26780065282e607306372abd0d808245","subRelType":"provision","target":"50|DansKnawCris::26780065282e607306372abd0d808246"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"Cites","relType":"datasourceOrganization","source":"50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9","subRelType":"provision","target":"50|DansKnawCris::26780065282e607306372abd0d808245"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"Cites","relType":"datasourceOrganization","source":"50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9","subRelType":"provision","target":"50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/software b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/software new file mode 100644 index 0000000..e69de29 From 7563499740d55e9469ac4fa4bdc84ff41c6b51cb Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 16 May 2023 14:29:31 +0200 Subject: [PATCH 08/19] [DUMP CSV] - --- .../dhp/oa/graph/dump/csv/AuthorResult.java | 130 
+++++++++--------- .../csv/{Constats.java => Constants.java} | 5 +- 2 files changed, 68 insertions(+), 67 deletions(-) rename dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/{Constats.java => Constants.java} (58%) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java index 11a2f0e..dcc89c5 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java @@ -1,93 +1,93 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv; -import eu.dnetlib.dhp.utils.DHPUtils; - import java.io.Serializable; +import eu.dnetlib.dhp.utils.DHPUtils; + /** * @author miriam.baglioni * @Date 05/05/23 */ public class AuthorResult implements Serializable { - private String authorId; - private String firstName; - private String lastName; - private String fullName; - private String orcid; - private String resultId; - private String rank; + private String authorId; + private String firstName; + private String lastName; + private String fullName; + private String orcid; + private String resultId; + private String rank; + public String getFullName() { + return fullName; + } - public String getFullName() { - return fullName; - } + public void setFullName(String fullName) { + this.fullName = fullName; + } - public void setFullName(String fullName) { - this.fullName = fullName; - } + public String getAuthorId() { + return authorId; + } - public String getAuthorId() { - return authorId; - } + public void setAuthorId(String authorId) { + this.authorId = authorId; + } - public void setAuthorId(String authorId) { - this.authorId = authorId; - } + public String getResultId() { + return resultId; + } - public String getResultId() { - return resultId; - } + public void setResultId(String resultId) { + this.resultId = resultId; + } - public void setResultId(String resultId) { - this.resultId = resultId; - } + public String getRank() { + return rank; + } - public String getRank() { - return rank; - } + public void setRank(String rank) { + this.rank = rank; + } - public void setRank(String rank) { - this.rank = rank; - } + public String getId() { + return authorId; + } - public String getId() { - return authorId; - } + public void setId(String id) { + this.authorId = id; + } - public void setId(String id) { - this.authorId = id; - } + public String getFirstName() { + return firstName; + } - public String getFirstName() { - return firstName; - } + public void setFirstName(String firstName) { + this.firstName = firstName; + } - public void setFirstName(String firstName) { - this.firstName = firstName; - } + public String getLastName() { + return lastName; + } - public String getLastName() { - return lastName; - } + public void setLastName(String lastName) { + this.lastName = lastName; + } - public void setLastName(String lastName) { - this.lastName = lastName; - } + public String getOrcid() { + return orcid; + } - public String getOrcid() { - return orcid; - } + public void setOrcid(String orcid) { + this.orcid = orcid; + } - public void setOrcid(String orcid) { - this.orcid = orcid; - } + public void autosetId() { + if (orcid != null) { + authorId = DHPUtils.md5(orcid); + } else { + authorId = DHPUtils.md5(resultId + firstName + lastName + rank); + } - public void autosetId() { - if(orcid != null){ - authorId = DHPUtils.md5(orcid); - }else{ - authorId = DHPUtils.md5(resultId + firstName + lastName + rank); - } - - } + } } diff --git 
a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constats.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constants.java similarity index 58% rename from dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constats.java rename to dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constants.java index cb9d5b0..cfb1fd7 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constats.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constants.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv; import java.io.Serializable; @@ -6,6 +7,6 @@ import java.io.Serializable; * @author miriam.baglioni * @Date 10/05/23 */ -public class Constats implements Serializable { - public final static String SEP = "\t"; +public class Constants implements Serializable { + public final static String SEP = "\t"; } From 66873c1744a621ef957e6671ba9b68e0f9b757a9 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 17 May 2023 16:56:28 +0200 Subject: [PATCH 09/19] [DUMP CSV] Dumping of the results, of the authors and the relationships between results and authors and results and pids --- .../dhp/oa/graph/dump/csv/AuthorResult.java | 2 +- .../oa/graph/dump/csv/SparkDumpResults.java | 502 +++++++++--------- .../oa/graph/dump/input_dump_csv_ste3.json | 41 ++ .../graph/dump/csv/DumpCommunitiesTest.java | 9 + .../dhp/oa/graph/dump/csv/DumpResultTest.java | 9 + .../dump/csv/working/resultIds/part-00000 | 0 .../dump/csv/working/resultIds/part-00049 | 1 + .../dump/csv/working/resultIds/part-00089 | 1 + .../dump/csv/working/resultIds/part-00169 | 1 + 9 files changed, 319 insertions(+), 247 deletions(-) create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste3.json create mode 100644 dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunitiesTest.java create mode 100644 dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00000 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00049 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00089 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00169 diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java index dcc89c5..d4c350a 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java @@ -86,7 +86,7 @@ public class AuthorResult implements Serializable { if (orcid != null) { authorId = DHPUtils.md5(orcid); } else { - authorId = DHPUtils.md5(resultId + firstName + lastName + rank); + authorId = DHPUtils.md5(resultId + rank); } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java index 4270e1b..c416334 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java @@ -1,5 +1,25 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static org.apache.commons.lang3.StringUtils.split; + +import java.io.Serializable; +import java.util.*; +import 
java.util.stream.Collector; +import java.util.stream.Collectors; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRelResAut; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.*; +import org.apache.spark.sql.*; +import org.apache.spark.sql.Dataset; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.graph.dump.Utils; import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor; @@ -7,282 +27,272 @@ import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVPid; import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; - import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.utils.DHPUtils; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang.StringUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.MapGroupsFunction; -import org.apache.spark.sql.*; -import org.apache.spark.sql.Dataset; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import scala.Tuple2; -import java.io.Serializable; -import java.util.*; -import java.util.stream.Collectors; - - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - /** * @author miriam.baglioni * @Date 04/05/23 */ +//STEP 3 public class SparkDumpResults implements Serializable { + private static final Logger log = LoggerFactory.getLogger(SparkDumpResults.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkDumpResults.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste3.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String resultType = parser.get("resultType"); + log.info("resultType: {}", resultType); + + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + final String workingPath = parser.get("workingPath"); + + Class inputClazz = (Class) Class.forName(resultClassName); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Utils.removeOutputDir(spark, outputPath); + run(spark, inputPath, outputPath, inputClazz, resultType, workingPath); + + }); + + } + + private static void run(SparkSession spark, String inputPath, String outputPath, + Class inputClazz, String resultType, String workingPath) { + + Dataset resultIds = spark.read().textFile(workingPath + "/resultIds"); + Dataset results = Utils + .readPath(spark, inputPath + "/" + resultType, inputClazz) + .filter( + (FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && 
!p.getDataInfo().getInvisible()); + + // map results + resultIds + .joinWith(results, resultIds.col("value").equalTo(results.col("id"))) + .map( + (MapFunction, CSVResult>) t2 -> mapResultInfo(t2._2()), + Encoders.bean(CSVResult.class)) + .write() + .option("compression", "gzip") + .option("header","true") + .option("delimiter",Constants.SEP) + .mode(SaveMode.Overwrite) + .csv(workingPath + "/" + resultType + "/result"); + + // map relations between pid and result + resultIds + .joinWith(results, resultIds.col("value").equalTo(results.col("id"))) + .flatMap((FlatMapFunction, CSVPid>) t2 -> { + List pids = new ArrayList<>(); + if (Optional.ofNullable(t2._2().getPid()).isPresent() && t2._2().getPid().size() > 0) { + pids.addAll(mapPid(t2._2().getPid(), t2._1())); + } + return pids.iterator(); + }, Encoders.bean(CSVPid.class)) + .filter(Objects::nonNull) + .write() + .option("compression", "gzip") + .option("header","true") + .option("delimiter", Constants.SEP) + .mode(SaveMode.Overwrite) + .csv(workingPath + "/" + resultType + "/result_pid"); + + // map authors from the result + // for each author in the result + // if the author has an orcid, the author id is derived from the orcid (i.e. md5(orcid)) + // if there is no orcid, the id is built from result_id + authorrank (if the rank is missing, + // the author's position among the authors is used), always hashed with md5 + Dataset authorResult = resultIds + .joinWith(results, resultIds.col("value").equalTo(results.col("id"))) + .flatMap((FlatMapFunction, AuthorResult>) t2 -> { + int count = 0; + List arl = new ArrayList<>(); + for (Author a : t2._2().getAuthor()) { + count += 1; + AuthorResult ar = new AuthorResult(); + ar.setResultId(t2._1()); + if (Optional.ofNullable(a.getRank()).isPresent()) { + if (a.getRank() > 0) { + ar.setRank(String.valueOf(a.getRank())); + } else { + ar.setRank(String.valueOf(count)); + } + } + ar.setFirstName(a.getName()); + ar.setLastName(a.getSurname()); + ar.setFullName(a.getFullname()); + ar.setOrcid(getOrcid(a.getPid())); + ar.autosetId(); + arl.add(ar); + } + return arl.iterator(); + }, Encoders.bean(AuthorResult.class)); - private static final Logger log = LoggerFactory.getLogger(SparkDumpResults.class); + // map the relation between author and result + authorResult + .map( + (MapFunction) ar -> { + CSVRelResAut ret = new CSVRelResAut(); + ret.setResult_id(ar.getResultId() ); + ret.setAuthor_id( ar.getAuthorId()); + return ret; + }, + Encoders.bean(CSVRelResAut.class)) + .write() + .option("compression", "gzip") + .option("header","true") + .option("delimiter",Constants.SEP) + .mode(SaveMode.Overwrite) + .csv(workingPath + "/" + resultType + "/result_author"); - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - SparkDumpResults.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_step2.json")); + // map the authors in the working dir.
I do not want to have them repeated + authorResult + .groupByKey((MapFunction) ar -> ar.getAuthorId(), Encoders.STRING()) + .mapGroups( + (MapGroupsFunction) (k, it) -> getAuthorDump(it.next()), + Encoders.bean(CSVAuthor.class)) + .write() + .option("compression", "gzip") + .option("header","true") + .option("delimiter",Constants.SEP) + .mode(SaveMode.Overwrite) + .csv(workingPath + "/" + resultType + "/author"); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + } - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + private static List mapPid(List pid, String resultId) { + return pid.stream().map(p -> p.getQualifier().getClassid().toLowerCase() + "@" + p.getValue().toLowerCase()).distinct().map(p -> { + CSVPid ret = new CSVPid(); + ret.setId(DHPUtils.md5(p)); + ret.setResult_id(resultId); + ret.setPid(split(p, "@")[1]); + ret.setType(split(p, "@")[0]); - final String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + return ret; + }).collect(Collectors.toList()); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + } - final String resultType = parser.get("resultType"); - log.info("resultType: {}", resultType); + private static CSVAuthor getAuthorDump(AuthorResult ar) { + CSVAuthor ret = new CSVAuthor(); + ret.setFirstname(ar.getFirstName()); - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); + ret.setId(ar.getAuthorId()); + ret.setLastname(ar.getLastName()); - final String workingPath = parser.get("workingPath"); + ret.setFullname(ar.getFullName()); - Class inputClazz = (Class) Class.forName(resultClassName); + if (ar.getOrcid() != null) { + ret.setOrcid(ar.getOrcid()); + } else { + ret.setOrcid(""); + } - SparkConf conf = new SparkConf(); + return ret; + } - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - Utils.removeOutputDir(spark, outputPath ); - run(spark, inputPath, outputPath, inputClazz, resultType, workingPath); + private static String getOrcid(List pid) { + if (!Optional.ofNullable(pid).isPresent()) + return null; + if (pid.size() == 0) + return null; + for (StructuredProperty p : pid) { + if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)) { + return p.getValue(); + } + } + return null; + } - }); + private static String getFieldValue(Field input){ + if (input != null && + StringUtils.isNotEmpty(input.getValue())) { + return input.getValue(); + } else { + return ""; + } + } + private static CSVResult mapResultInfo(R r) { + CSVResult ret = new CSVResult(); + ret.setId(r.getId()); + ret.setType(r.getResulttype().getClassid()); + ret.setTitle(getTitle(r.getTitle())); + ret.setDescription(getAbstract(r.getDescription())); + ret.setAccessright(r.getBestaccessright().getClassid()); + ret.setPublication_date(getFieldValue(r.getDateofacceptance())); + ret.setPublisher(getFieldValue(r.getPublisher())); - } - - private static void run(SparkSession spark, String inputPath, String outputPath, - Class inputClazz, String resultType, String workingPath) { - - Dataset resultIds = spark.read().textFile(workingPath + "/resultIds"); - Dataset results = Utils - .readPath(spark, inputPath + "/" + resultType, inputClazz) - .filter((FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && 
!p.getDataInfo().getInvisible()); - - // map results - resultIds.joinWith(results, resultIds.col("value").equalTo(results.col("id"))) - .map((MapFunction, CSVResult>) t2 -> mapResultInfo(t2._2()), Encoders.bean(CSVResult.class) ) - .write() - .option("compression","gzip") - .mode(SaveMode.Overwrite) - .csv(workingPath + "/" + resultType + "/result"); + ret.setKeywords(String.join(", ", r.getSubject().stream().map(s -> { + if (StringUtils.isNotEmpty(s.getValue())) + return s.getValue().toLowerCase(); + else + return null;}).filter(Objects::nonNull).distinct().collect(Collectors.toList()))); - // map relations between pid and result - resultIds.joinWith(results, resultIds.col("value").equalTo(results.col("id"))) - .flatMap((FlatMapFunction, CSVPid>) t2 -> - { - List pids = new ArrayList<>(); - if(Optional.ofNullable(t2._2().getPid()).isPresent() && t2._2().getPid().size() > 0){ - pids.addAll(mapPid(t2._2().getPid(), t2._1())); - } - return pids.iterator(); - }, Encoders.bean(CSVPid.class)) - .filter(Objects::nonNull) - .write() - .option("compression","gzip") - .mode(SaveMode.Overwrite) - .csv(workingPath + "/" + resultType + "/result_pid"); + ret.setCountry(String.join(", ", r.getCountry().stream().map(Country::getClassid).collect(Collectors.toList()))); + if (StringUtils.isNotEmpty(r.getLanguage().getClassid())) { + ret.setLanguage(r.getLanguage().getClassid()); + } else { + ret.setLanguage(""); + } - //map authors from the result - //per ogni autore nel result - //se l'autore ha un orcid il suo id dipende dall'orcid (tipo md5(orcid)) - //se non ha orcid il suo id si costruisce come result_id + author_name + authorrank ( se non ha il rank si sua - //la sua posizione nell'insieme degli autori) sempre con md5 - Dataset authorResult = resultIds.joinWith(results, resultIds.col("value").equalTo(results.col("id"))) - .flatMap((FlatMapFunction, AuthorResult>) t2 -> - { - int count = 0; - List arl = new ArrayList<>(); - for (Author a : t2._2().getAuthor()) { - count += 1; - AuthorResult ar = new AuthorResult(); - ar.setResultId(t2._1()); - if (Optional.ofNullable(a.getRank()).isPresent()) { - if (a.getRank() > 0) { - ar.setRank(String.valueOf(a.getRank())); - } else { - ar.setRank(String.valueOf(count)); - } - } - ar.setFirstName(a.getName()); - ar.setLastName(a.getSurname()); - ar.setFullName(a.getFullname()); - ar.setOrcid(getOrcid(a.getPid())); - ar.autosetId(); - arl.add(ar); - } - return arl.iterator(); - } - , Encoders.bean(AuthorResult.class)); - - // map the relation between author and result - authorResult.map((MapFunction) ar -> ar.getResultId() + Constats.SEP + ar.getAuthorId(), Encoders.STRING() ) - .write() - .option("compression","gzip") - .mode(SaveMode.Overwrite) - .csv(workingPath + "/" + resultType + "/result_author"); - - - // ma the authors in the working dir. 
I do not want to have them repeated - authorResult.groupByKey((MapFunction) ar -> ar.getAuthorId(), Encoders.STRING() ) - .mapGroups((MapGroupsFunction) (k, it) -> getAuthorDump(it.next()) , Encoders.bean(CSVAuthor.class)) - .write() - .option("compression", "gzip") - .mode(SaveMode.Overwrite) - .csv(workingPath + "/" + resultType + "/author"); - - } - - private static List mapPid(List pid, String resultId) { - return pid.stream().map(p -> { - CSVPid ret = new CSVPid(); - - ret.setId(DHPUtils.md5(p.getQualifier().getClassid() + p.getValue())); - ret.setResult_id(resultId); - ret.setPid(p.getValue()); - ret.setType(p.getQualifier().getClassid()); - - return ret; - }).collect(Collectors.toList()); - - } - - private static CSVAuthor getAuthorDump(AuthorResult ar) { - CSVAuthor ret = new CSVAuthor(); - ret.setFirstname(ar.getFirstName()); - - ret.setId(ar.getAuthorId()); - ret.setLastname(ar.getLastName()); - - ret.setFullname(ar.getFullName()); - - if(ar.getOrcid() != null){ - ret.setOrcid(ar.getOrcid()); - }else{ - ret.setOrcid(""); - } - - return ret; - } - - private static String getOrcid(List pid) { - if(!Optional.ofNullable(pid).isPresent()) - return null; - if(pid.size() == 0) - return null; - for(StructuredProperty p : pid){ - if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)){ - return p.getValue(); - } - } - return null; - } - - - - private static CSVResult mapResultInfo(R r) { - CSVResult ret = new CSVResult(); - ret.setId(r.getId()); - ret.setType(r.getResulttype().getClassid()); - ret.setTitle(getTitle(r.getTitle())); - ret.setDescription(getAbstract(r.getDescription())); - ret.setAccessright(r.getBestaccessright().getClassid()); - ret.setPublication_date(r.getDateofacceptance().getValue()); - if (StringUtils.isNotEmpty(r.getPublisher().getValue())) { - ret.setPublisher(r.getPublisher().getValue()); - } else { - ret.setPublisher(""); - } - - StringBuilder sbjs = new StringBuilder(); - for(StructuredProperty sbj : r.getSubject()){ - if(StringUtils.isNotEmpty(sbj.getValue())){ - sbjs.append(sbj.getValue()); - sbjs.append(","); - } - } - ret.setKeywords(sbjs.toString()); - - StringBuilder countries = new StringBuilder(); - - for(Country c: r.getCountry()){ - if(StringUtils.isNotEmpty(c.getClassid())){ - countries.append(c.getClassid()); - } - } - ret.setCountry(countries.toString()); - - if(StringUtils.isNotEmpty(r.getLanguage().getClassid())){ - ret.setLanguage(r.getLanguage().getClassid()); - }else{ - ret.setLanguage(""); - } - - return ret; - } - - private static String getAbstract(List> description) { - for(Field abs:description){ - if(StringUtils.isNotEmpty(abs.getValue())){ - return abs.getValue(); - } - } - return ""; - } - - - private static String getTitle(List titles) { - String firstTitle = null; - for(StructuredProperty title : titles){ - if(StringUtils.isEmpty(firstTitle)){ - if(StringUtils.isNotEmpty(title.getValue())) - firstTitle = title.getValue(); - } - if(title.getQualifier().getClassid().equals(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid())){ - if(StringUtils.isNotEmpty(title.getValue())) - return title.getValue(); - } - } - return ""; - } - + return ret; + } + private static String getAbstract(List> description) { + if(description == null) + return ""; + for (Field abs : description) { + if (StringUtils.isNotEmpty(abs.getValue())) { + return abs.getValue(); + } + } + return ""; + } + private static String getTitle(List titles) { + String firstTitle = null; + for (StructuredProperty title : titles) { + if (StringUtils.isEmpty(firstTitle)) { + if 
(StringUtils.isNotEmpty(title.getValue())) + firstTitle = title.getValue(); + } + if (title.getQualifier().getClassid().equals(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid())) { + if (StringUtils.isNotEmpty(title.getValue())) + return title.getValue(); + } + } + return ""; + } } diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste3.json b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste3.json new file mode 100644 index 0000000..87a01d8 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste3.json @@ -0,0 +1,41 @@ +[ + + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequential file to read", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName":"wp", + "paramLongName":"workingPath", + "paramDescription": "the path of the working directory where intermediate results are stored", + "paramRequired": true + }, + { + "paramName":"rt", + "paramLongName":"resultType", + "paramDescription": "the type of the result we are currently dumping (e.g. publication)", + "paramRequired": true + }, + { + "paramName":"rtn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + } +] + + diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunitiesTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunitiesTest.java new file mode 100644 index 0000000..605f1ec --- /dev/null +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunitiesTest.java @@ -0,0 +1,9 @@ + +package eu.dnetlib.dhp.oa.graph.dump.csv; + +/** + * @author miriam.baglioni + * @Date 11/05/23 + */ +public class DumpCommunitiesTest { +} diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java new file mode 100644 index 0000000..fc40bd5 --- /dev/null +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java @@ -0,0 +1,9 @@ + +package eu.dnetlib.dhp.oa.graph.dump.csv; + +/** + * @author miriam.baglioni + * @Date 11/05/23 + */ +public class DumpResultTest { +} diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00000 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00000 new file mode 100644 index 0000000..e69de29 diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00049 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00049 new file mode 100644 index 0000000..6a5ffaf --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00049 @@ -0,0 +1 @@ +50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9 diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00089 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00089 new file mode 100644 index 0000000..07020d0 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00089 @@ -0,0 +1 @@
+50|DansKnawCris::0224aae28af558f21768dbc6439c7a95 diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00169 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00169 new file mode 100644 index 0000000..7ceab26 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00169 @@ -0,0 +1 @@ +50|DansKnawCris::26780065282e607306372abd0d808245 From 21599598ae66fe4bb23f3e8c536de81a21aaf16b Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 17 May 2023 16:57:25 +0200 Subject: [PATCH 10/19] [DUMP CSV] test and resources for the result dumps --- .../dhp/oa/graph/dump/csv/DumpResultTest.java | 260 ++++++++++++++++++ 1 file changed, 260 insertions(+) diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java index fc40bd5..dba7b94 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java @@ -1,9 +1,269 @@ package eu.dnetlib.dhp.oa.graph.dump.csv; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.utils.DHPUtils; +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Optional; + +import static org.apache.commons.lang3.StringUtils.split; + /** * @author miriam.baglioni * @Date 11/05/23 */ public class DumpResultTest { + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + private static final Logger log = LoggerFactory + .getLogger(DumpResultTest.class); + + private static HashMap map = new HashMap<>(); + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files + .createTempDirectory(DumpResultTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(DumpResultTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(DumpResultTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + public void testDumpResult() throws Exception { + + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") + .getPath(); + + spark.read().text(getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") + .getPath()) + .write() + .text(workingDir.toString() + 
"/working/resultIds/"); + + SparkDumpResults.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-outputPath", workingDir.toString() + "/output", + "-workingPath", workingDir.toString() + "/working", + "-resultType", "publication", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-sourcePath", sourcePath + }); + + + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + Dataset tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP) + .csv(workingDir.toString() + "/working/publication/result"); + + Assertions.assertEquals(3, tmp.count()); + Row row = tmp + .where("id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'") + .first(); + Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(),row.getAs("accessright")); + Assertions.assertEquals("FI" ,row.getAs("country")); + Assertions.assertEquals("Lit.opg., bijl." ,row.getAs("description")); + Assertions.assertEquals(3 ,split(row.getAs("keywords"), ", ").length); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); + Assertions.assertTrue(row.getAs("keywords").toString().contains("prospectie")); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); + Assertions.assertEquals("nl", row.getAs("language")); + Assertions.assertEquals("2007-01-01", row.getAs("publication_date")); + Assertions.assertEquals("FakePublisher1", row.getAs("publisher")); + Assertions.assertEquals("Inventariserend veldonderzoek d.m.v. boringen (karterende fase) : Raadhuisstraat te Dirkshorn, gemeente Harenkarspel", row.getAs("title")); + Assertions.assertEquals("publication", row.getAs("type")); + + row = tmp + .where("id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'") + .first(); + Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(),row.getAs("accessright")); + Assertions.assertEquals(2 ,split(row.getAs("country"), ", ").length); + Assertions.assertNull(row.getAs("description")); + Assertions.assertEquals(2 ,split(row.getAs("keywords"), ", ").length); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); + Assertions.assertEquals("UNKNOWN", row.getAs("language")); + Assertions.assertNull( row.getAs("publication_date")); + Assertions.assertNull( row.getAs("publisher")); + Assertions.assertEquals("None", row.getAs("title")); + Assertions.assertEquals("publication", row.getAs("type")); + + row = tmp + .where("id = '50|DansKnawCris::26780065282e607306372abd0d808245'") + .first(); + Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(),row.getAs("accessright")); + Assertions.assertNull(row.getAs("country")); + Assertions.assertNull(row.getAs("description")); + Assertions.assertEquals(2 ,split(row.getAs("keywords"), ", ").length); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); + Assertions.assertEquals("UNKNOWN", row.getAs("language")); + Assertions.assertNull( row.getAs("publication_date")); + Assertions.assertNull( row.getAs("publisher")); + Assertions.assertEquals("None", row.getAs("title")); + Assertions.assertEquals("publication", row.getAs("type")); + + + } + + @Test + public void testDumpAuthor() throws Exception { + + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") + 
.getPath(); + + spark.read().text(getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") + .getPath()) + .write() + .text(workingDir.toString() + "/working/resultIds/"); + + SparkDumpResults.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-outputPath", workingDir.toString() + "/output", + "-workingPath", workingDir.toString() + "/working", + "-resultType", "publication", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-sourcePath", sourcePath + }); + + + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + Dataset tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP) + .csv(workingDir.toString() + "/working/publication/author"); + + Assertions.assertEquals(5, tmp.count()); + + Assertions.assertEquals(1,tmp.where("firstName == 'Maryam'").count()); + + Assertions.assertEquals(DHPUtils.md5("50|DansKnawCris::0224aae28af558f21768dbc6439c7a951"),tmp.where("firstName == 'Maryam'").first().getAs("id")); + Assertions.assertEquals(DHPUtils.md5("0000-0003-2914-2734"),tmp.where("firstName == 'Michael'").first().getAs("id")); + Assertions.assertEquals(DHPUtils.md5("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d92"),tmp.where("firstName == 'Mikhail'").first().getAs("id")); + + + } + + @Test + public void testDumpResultAuthorRelations() throws Exception { + + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") + .getPath(); + + spark.read().text(getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") + .getPath()) + .write() + .text(workingDir.toString() + "/working/resultIds/"); + + SparkDumpResults.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-outputPath", workingDir.toString() + "/output", + "-workingPath", workingDir.toString() + "/working", + "-resultType", "publication", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-sourcePath", sourcePath + }); + + + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + Dataset tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP) + .csv(workingDir.toString() + "/working/publication/result_author"); + + Assertions.assertEquals(6, tmp.count()); + + Assertions.assertEquals(2, tmp.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'").count()); + Assertions.assertEquals(1, tmp.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'") + .where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'").count()); + Assertions.assertEquals(1, tmp.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'") + .where("result_id == '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'").count()); + + + } + + @Test + public void testDumpResultPid() throws Exception { + + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") + .getPath(); + + spark.read().text(getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") + .getPath()) + .write() + .text(workingDir.toString() + "/working/resultIds/"); + + SparkDumpResults.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-outputPath", workingDir.toString() + "/output", + "-workingPath", workingDir.toString() + "/working", + "-resultType", "publication", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-sourcePath", sourcePath + }); + + + + final JavaSparkContext sc = 
JavaSparkContext.fromSparkContext(spark.sparkContext()); + + Dataset tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP) + .csv(workingDir.toString() + "/working/publication/result_pid"); + + tmp.show(false); + Assertions.assertEquals(4, tmp.count()); + + Assertions.assertEquals(2, tmp.where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'").count()); + Assertions.assertEquals("10.1023/fakedoi", tmp.where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' and type == 'doi'").first().getAs("pid")); + + + } } From f79b9d5c0d257571c27e8afc3d34f6c941b9f94a Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 17 May 2023 16:58:04 +0200 Subject: [PATCH 11/19] [DUMP CSV] sligth modification --- .../oa/graph/dump/csv/SelectResultAndDumpRelationTest.java | 3 --- .../eu/dnetlib/dhp/oa/graph/dump/csv/input/publication | 6 +++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java index 1891a67..767267c 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java @@ -26,9 +26,6 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.graph.dump.complete.SelectRelationTest; -import eu.dnetlib.dhp.oa.graph.dump.complete.SparkSelectValidRelationsJob; -import eu.dnetlib.dhp.schema.oaf.Relation; /** * @author miriam.baglioni diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication index 4983021..2b003e7 100644 --- a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication @@ -1,4 +1,4 @@ -{"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}, {"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"ni"},{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - 
Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"dh-ch"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"dateofcollection":"","dateoftransformation":"2020-05-25T16:14:18.452Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Lit.opg., bijl."}],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::0224aae28af558f21768dbc6439c7a95","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"nl","classname":"nl","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282676557,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T07:51:24Z","harvestDate":"2020-05-25T11:33:13.427Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550013110","metadataNamespace":""}},"originalId":["DansKnawCris::0224aae28af558f21768dbc6439c7a95"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550013110"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550013110"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"prospectie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimpor
t:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Inventariserend veldonderzoek d.m.v. boringen (karterende fase) : Raadhuisstraat te Dirkshorn, gemeente Harenkarspel"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Synthegra Archeologie Rapportenreeks P0502381"}],"journal":null} -{"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:03:57.761Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591283087415,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T07:58:39Z","harvestDate":"2020-05-25T11:34:38.707Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce-kb:document:800020324","metadataNamespace":""}},"originalId":["DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceFullCatalogue&search=priref=800020324"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceFullCatalogue&search=priref=800020324"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datas
etarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} -{"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"ni"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:13:23.976Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::26780065282e607306372abd0d808245","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282897527,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T08:42:33Z","harvestDate":"2020-05-25T11:40:10.845Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550053196","metadataNamespace":""}},"originalId":["DansKnawCris::26780065282e607306372abd0d808245"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive",
"classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} +{"author":[{"affiliation":[],"fullname":"Alrasheed, Maryam","name":"Maryam","pid":[],"rank":1,"surname":"Alrasheed"},{"affiliation":[],"fullname":"Blondin, Michael","name":"Michael","pid":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"orcid","classname":"Open Researcher and Contributor ID","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"0000-0003-2914-2734"}],"rank":1,"surname":"Blondin"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}, {"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"ni"},{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"dh-ch"}],"contributor":[],"country":[{"classid":"FI","classname":"Finland","dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"country:instrepos","classname":"Inferred by 
OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"schemeid":"dnet:countries","schemename":"dnet:countries"}],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"dateofcollection":"","dateoftransformation":"2020-05-25T16:14:18.452Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Lit.opg., bijl."}],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::0224aae28af558f21768dbc6439c7a95","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"nl","classname":"nl","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282676557,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T07:51:24Z","harvestDate":"2020-05-25T11:33:13.427Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550013110","metadataNamespace":""}},"originalId":["DansKnawCris::0224aae28af558f21768dbc6439c7a95"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550013110"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1023/fakedoi"}],"publisher":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"FakePublisher1"},"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"prospectie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","s
chemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Inventariserend veldonderzoek d.m.v. boringen (karterende fase) : Raadhuisstraat te Dirkshorn, gemeente Harenkarspel"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Synthegra Archeologie Rapportenreeks P0502381"}],"journal":null} +{"author":[{"affiliation":[],"fullname":"Blondin, Michael","name":"Michael","pid":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"orcid","classname":"Open Researcher and Contributor ID","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"0000-0003-2914-2734"}],"rank":1,"surname":"Blondin"},{"affiliation":[],"fullname":"Raskin, Mikhail","name":"Mikhail","pid":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"orcid_pending","classname":"Open Researcher and Contributor ID","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"0000-0002-6660-5673"}],"rank":2,"surname":"Raskin"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}],"contributor":[],"country":[{"classid":"IT","classname":"Finland","dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"country:instrepos","classname":"Inferred by 
OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"schemeid":"dnet:countries","schemename":"dnet:countries"},{"classid":"FI","classname":"Finland","dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"country:instrepos","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"schemeid":"dnet:countries","schemename":"dnet:countries"}],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:03:57.761Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591283087415,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T07:58:39Z","harvestDate":"2020-05-25T11:34:38.707Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce-kb:document:800020324","metadataNamespace":""}},"originalId":["DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceFullCatalogue&search=priref=800020324"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"",
"classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} +{"author":[{"affiliation":[],"fullname":"Ward, Mark Daniel","name":"Mark Daniel","pid":[],"rank":1,"surname":"Ward"},{"affiliation":[],"fullname":"Szpankowski, Wojciech","name":"Wojciech","pid":[],"rank":2,"surname":"Szpankowski"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"ni"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:13:23.976Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::26780065282e607306372abd0d808245","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282897527,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T08:42:33Z","harvestDate":"2020-05-25T11:40:10.845Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550053196","metadataNamespace":""}},"originalId":["DansKnawCris::26780065282e607306372abd0d808245"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} {"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open 
Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:13:23.976Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::26780065282e607306372abd0d80fake","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282897527,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T08:42:33Z","harvestDate":"2020-05-25T11:40:10.845Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550053196","metadataNamespace":""}},"originalId":["DansKnawCris::26780065282e607306372abd0d808245"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017
","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} \ No newline at end of file From 2e0999a1dfd1db545ac7c0601926220f60708e13 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 29 May 2023 10:16:47 +0200 Subject: [PATCH 12/19] First implementation of the csv dump --- .../oa/graph/dump/QueryInformationSystem.java | 13 +- .../dhp/oa/graph/dump/csv/AuthorResult.java | 11 +- .../oa/graph/dump/csv/DumpCommunities.java | 147 +++--- .../oa/graph/dump/csv/SparkDumpResults.java | 179 ++++--- .../graph/dump/csv/SparkMoveOnSigleDir.java | 166 ++++--- .../SparkSelectResultsAndDumpRelations.java | 30 +- .../oa/graph/dump/csv/model/CSVAuthor.java | 9 + .../oa/graph/dump/csv/oozie_app/workflow.xml | 126 ++--- .../oa/graph/dump/input_dump_csv_ste1.json | 30 ++ .../oa/graph/dump/input_dump_csv_ste3.json | 7 +- .../oa/graph/dump/input_dump_csv_ste4.json | 25 + .../graph/dump/xqueries/set_of_communities.xq | 2 +- .../dhp/oa/graph/dump/csv/DumpResultTest.java | 458 ++++++++++-------- .../graph/dump/csv/MoveOnSingleDirTest.java | 117 +++++ .../csv/SelectResultAndDumpRelationTest.java | 158 ++++-- .../dump/csv/working/dataset/author/part0 | 4 + .../dump/csv/working/dataset/result/part0 | 12 + .../csv/working/dataset/result_author/part0 | 19 + .../dump/csv/working/dataset/result_pid/part0 | 33 ++ .../working/otherresearchproduct/author/part1 | 2 + .../working/otherresearchproduct/result/part0 | 4 + .../otherresearchproduct/result_author/part0 | 17 + .../otherresearchproduct/result_pid/part0 | 5 + .../dump/csv/working/publication/author/part0 | 4 + .../dump/csv/working/publication/result/part0 | 4 + .../working/publication/result_author/part0 | 17 + 
.../csv/working/publication/result_pid/part0 | 12 + .../dump/csv/working/software/author/part1 | 2 + .../dump/csv/working/software/result/part0 | 1 + .../csv/working/software/result_author/part0 | 2 + .../csv/working/software/result_pid/part0 | 2 + 31 files changed, 1090 insertions(+), 528 deletions(-) create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste1.json create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste4.json create mode 100644 dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/MoveOnSingleDirTest.java create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/author/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result_author/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result_pid/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/author/part1 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result_author/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result_pid/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/author/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result_author/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result_pid/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/author/part1 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result_author/part0 create mode 100644 dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result_pid/part0 diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java index 29ccab4..8ca73ea 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/QueryInformationSystem.java @@ -10,10 +10,13 @@ import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.Node; import org.dom4j.io.SAXReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; import eu.dnetlib.dhp.oa.graph.dump.csv.Constants; +import eu.dnetlib.dhp.oa.graph.dump.csv.DumpCommunities; import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -22,6 +25,8 @@ public class QueryInformationSystem { private ISLookUpService isLookUp; + private static final Logger log = LoggerFactory.getLogger(QueryInformationSystem.class); + private static final String XQUERY_ALL = "for $x in 
collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') " + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " + @@ -79,6 +84,7 @@ public class QueryInformationSystem { List communities = new ArrayList<>(); for (String xml : isLookUp.quickSearchProfile(toString)) { + log.info(xml); final Document doc; final SAXReader reader = new SAXReader(); reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); @@ -91,7 +97,12 @@ public class QueryInformationSystem { builder.append(Constants.SEP); builder.append(root.attribute("id").getValue()); builder.append(Constants.SEP); - builder.append(((Node) (root.selectNodes("/description").get(0))).getText()); + builder + .append( + ((Node) (root.selectNodes("//description").get(0))) + .getText() + .replace("\n", " ") + .replace("\t", " ")); communities.add(builder.toString()); } return communities; diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java index d4c350a..1628fdd 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/AuthorResult.java @@ -17,6 +17,15 @@ public class AuthorResult implements Serializable { private String orcid; private String resultId; private String rank; + private Boolean fromOrcid; + + public Boolean getFromOrcid() { + return fromOrcid; + } + + public void setFromOrcid(Boolean fromOrcid) { + this.fromOrcid = fromOrcid; + } public String getFullName() { return fullName; @@ -86,7 +95,7 @@ public class AuthorResult implements Serializable { if (orcid != null) { authorId = DHPUtils.md5(orcid); } else { - authorId = DHPUtils.md5(resultId + rank); + authorId = DHPUtils.md5(resultId + rank); } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunities.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunities.java index 94111e6..ebbadaa 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunities.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpCommunities.java @@ -1,11 +1,19 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem; -import eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap; -import eu.dnetlib.dhp.oa.graph.dump.Utils; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static org.apache.commons.lang3.StringUtils.split; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Serializable; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; @@ -18,91 +26,94 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; -import java.io.BufferedWriter; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Serializable; -import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem; +import eu.dnetlib.dhp.oa.graph.dump.SaveCommunityMap; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; /** * @author miriam.baglioni * @Date 09/05/23 */ +//STEP 1 public class DumpCommunities implements Serializable { - private static final Logger log = LoggerFactory.getLogger(DumpCommunities.class); - private final BufferedWriter writer; + private static final Logger log = LoggerFactory.getLogger(DumpCommunities.class); + private final BufferedWriter writer; + private final static String HEADER = "id" + Constants.SEP + "name" + Constants.SEP + "acronym" + Constants.SEP + + " description \n"; + private final transient QueryInformationSystem queryInformationSystem; - private final transient QueryInformationSystem queryInformationSystem; + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + DumpCommunities.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste1.json")); - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - DumpCommunities.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_step3.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + final String nameNode = parser.get("nameNode"); + log.info("nameNode: {}", nameNode); - final String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + final List communities = Arrays.asList(split(parser.get("communities"), ";")); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final DumpCommunities dc = new DumpCommunities(outputPath, nameNode, parser.get("isLookUpUrl")); - final String workingPath = parser.get("workingPath"); + dc.writeCommunity(communities); - final String nameNode = parser.get("nameNode"); - log.info("nameNode: {}", nameNode); + } + private void writeCommunity(List communities) + throws IOException, ISLookUpException, DocumentException, SAXException { + writer.write(HEADER); + writer.flush(); + String a = IOUtils + .toString( + DumpCommunities.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq")); - final DumpCommunities dc = new DumpCommunities(outputPath, nameNode, parser.get("isLookUp)")); + final String xquery = String + .format( + a, + communities + .stream() + .map(t -> String.format("$x//CONFIGURATION/context[./@id= '%s']", t)) + .collect(Collectors.joining(" or "))); - dc.writeCommunity(); + for (String community : queryInformationSystem + .getCommunityCsv(xquery)) { + writer + .write( + community); + writer.write("\n"); - } + } + writer.close(); + } - private void writeCommunity() throws IOException, ISLookUpException, DocumentException, SAXException { - for(String community : queryInformationSystem.getCommunityCsv(IOUtils.toString( - DumpCommunities.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq")))) - { - writer - .write( - community); - writer.write("\n"); + public 
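// The file produced by writeCommunity() above starts with the HEADER defined in this class and then
// holds one Constants.SEP-separated row per requested community, following the header columns
// id, name, acronym, description (the id being a hash of the context identifier, the description
// cleaned of newlines and tabs). Illustrative layout only, with hypothetical values and <SEP>
// standing for whatever separator Constants.SEP defines:
//
//   id<SEP>name<SEP>acronym<SEP> description
//   a1b2c3...<SEP>Digital Humanities and Cultural Heritage<SEP>dh-ch<SEP>Research community description ...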
DumpCommunities(String hdfsPath, String hdfsNameNode, String isLookUpUrl) throws Exception { + final Configuration conf = new Configuration(); + queryInformationSystem = new QueryInformationSystem(); + queryInformationSystem.setIsLookUp(Utils.getIsLookUpService(isLookUpUrl)); - } - writer.close(); - } + conf.set("fs.defaultFS", hdfsNameNode); + FileSystem fileSystem = FileSystem.get(conf); + Path hdfsWritePath = new Path(hdfsPath); + if (fileSystem.exists(hdfsWritePath)) { + fileSystem.delete(hdfsWritePath, true); + } + FSDataOutputStream fos = fileSystem.create(hdfsWritePath); - public DumpCommunities(String hdfsPath, String hdfsNameNode, String isLookUpUrl) throws Exception { - final Configuration conf = new Configuration(); - queryInformationSystem= new QueryInformationSystem(); - queryInformationSystem.setIsLookUp(Utils.getIsLookUpService(isLookUpUrl)); - - conf.set("fs.defaultFS", hdfsNameNode); - FileSystem fileSystem = FileSystem.get(conf); - Path hdfsWritePath = new Path(hdfsPath); - - if (fileSystem.exists(hdfsWritePath)) { - fileSystem.delete(hdfsWritePath, true); - } - FSDataOutputStream fos = fileSystem.create(hdfsWritePath); - - writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8)); - - - - } - + writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8)); + } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java index c416334..9fd10f1 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java @@ -9,8 +9,6 @@ import java.util.*; import java.util.stream.Collector; import java.util.stream.Collectors; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRelResAut; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.spark.SparkConf; @@ -20,10 +18,13 @@ import org.apache.spark.sql.Dataset; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.graph.dump.Utils; import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor; import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVPid; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRelResAut; import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; @@ -59,9 +60,6 @@ public class SparkDumpResults implements Serializable { final String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); - final String resultType = parser.get("resultType"); log.info("resultType: {}", resultType); @@ -78,14 +76,14 @@ public class SparkDumpResults implements Serializable { conf, isSparkSessionManaged, spark -> { - Utils.removeOutputDir(spark, outputPath); - run(spark, inputPath, outputPath, inputClazz, resultType, workingPath); + // Utils.removeOutputDir(spark, outputPath); + run(spark, inputPath, inputClazz, resultType, workingPath); }); } - private static void run(SparkSession spark, String inputPath, String outputPath, + private static void run(SparkSession spark, String inputPath, Class inputClazz, String resultType, String workingPath) { Dataset resultIds = 
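// resultIds is read from the working directory and is expected to hold the identifiers of the results
// associated with the communities of interest, as prepared by an earlier step of the workflow; the join
// below keeps only the graph results with those identifiers, after deleted-by-inference and invisible
// records have been filtered out, and stores them as an intermediate JSON dataset under
// <workingPath>/<resultType>/temp/result before the CSVResult, CSVPid and author mappings are applied.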
spark.read().textFile(workingPath + "/resultIds"); @@ -94,85 +92,104 @@ public class SparkDumpResults implements Serializable { .filter( (FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && !p.getDataInfo().getInvisible()); - // map results resultIds .joinWith(results, resultIds.col("value").equalTo(results.col("id"))) + .map((MapFunction, R>) t2 -> t2._2(), Encoders.bean(inputClazz)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(workingPath + "/" + resultType + "/temp/result"); + + // map results + results = Utils.readPath(spark, workingPath + "/" + resultType + "/temp/result", inputClazz); + results .map( - (MapFunction, CSVResult>) t2 -> mapResultInfo(t2._2()), + (MapFunction) r -> mapResultInfo(r), Encoders.bean(CSVResult.class)) .write() .option("compression", "gzip") - .option("header","true") - .option("delimiter",Constants.SEP) +// .option("header", "true") +// .option("delimiter", Constants.SEP) .mode(SaveMode.Overwrite) - .csv(workingPath + "/" + resultType + "/result"); + .json(workingPath + "/" + resultType + "/result"); // map relations between pid and result - resultIds - .joinWith(results, resultIds.col("value").equalTo(results.col("id"))) - .flatMap((FlatMapFunction, CSVPid>) t2 -> { + results + .flatMap((FlatMapFunction) r -> { List pids = new ArrayList<>(); - if (Optional.ofNullable(t2._2().getPid()).isPresent() && t2._2().getPid().size() > 0) { - pids.addAll(mapPid(t2._2().getPid(), t2._1())); + if (Optional.ofNullable(r.getPid()).isPresent() && r.getPid().size() > 0) { + pids.addAll(mapPid(r.getPid(), r.getId())); } return pids.iterator(); }, Encoders.bean(CSVPid.class)) .filter(Objects::nonNull) .write() .option("compression", "gzip") - .option("header","true") - .option("delimiter", Constants.SEP) +// .option("header", "true") +// .option("delimiter", Constants.SEP) .mode(SaveMode.Overwrite) - .csv(workingPath + "/" + resultType + "/result_pid"); + .json(workingPath + "/" + resultType + "/result_pid"); // map authors from the result // per ogni autore nel result // se l'autore ha un orcid il suo id dipende dall'orcid (tipo md5(orcid)) - // se non ha orcid il suo id si costruisce come result_id + authorrank ( se non ha il rank si sua + // se non ha orcid il suo id si costruisce come result_id + authorrank ( se non ha il rank si sua // la sua posizione nell'insieme degli autori) sempre con md5 - Dataset authorResult = resultIds - .joinWith(results, resultIds.col("value").equalTo(results.col("id"))) - .flatMap((FlatMapFunction, AuthorResult>) t2 -> { + results + .flatMap((FlatMapFunction) r -> { int count = 0; List arl = new ArrayList<>(); - for (Author a : t2._2().getAuthor()) { - count += 1; - AuthorResult ar = new AuthorResult(); - ar.setResultId(t2._1()); - if (Optional.ofNullable(a.getRank()).isPresent()) { - if (a.getRank() > 0) { - ar.setRank(String.valueOf(a.getRank())); - } else { - ar.setRank(String.valueOf(count)); + if (Optional.ofNullable(r.getAuthor()).isPresent()) { + for (Author a : r.getAuthor()) { + count += 1; + AuthorResult ar = new AuthorResult(); + ar.setResultId(r.getId()); + if (Optional.ofNullable(a.getRank()).isPresent()) { + if (a.getRank() > 0) { + ar.setRank(String.valueOf(a.getRank())); + } else { + ar.setRank(String.valueOf(count)); + } } + ar.setFirstName(a.getName()); + ar.setLastName(a.getSurname()); + ar.setFullName(a.getFullname()); + Tuple2 orcid = getOrcid(a.getPid()); + if (Optional.ofNullable(orcid).isPresent()) { + ar.setOrcid(orcid._1()); + ar.setFromOrcid(orcid._2()); + } + + 
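// For each author of the result: when an ORCID is available, the author identifier is derived from it
// (md5 of the orcid value); otherwise it is built from the result identifier plus the author rank,
// falling back to the author's position in the author list when no rank is provided. The fromOrcid flag
// records whether the identifier comes from an asserted "orcid" pid or from an "orcid_pending" one, as
// returned by getOrcid(). For illustration, the following sketch mirrors the rule applied by
// autosetId() on the next line (sketchedId is a hypothetical, unused local, not part of the patch):
String sketchedId = ar.getOrcid() != null
	? DHPUtils.md5(ar.getOrcid())
	: DHPUtils.md5(ar.getResultId() + ar.getRank());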
ar.autosetId(); + arl.add(ar); } - ar.setFirstName(a.getName()); - ar.setLastName(a.getSurname()); - ar.setFullName(a.getFullname()); - ar.setOrcid(getOrcid(a.getPid())); - ar.autosetId(); - arl.add(ar); } + return arl.iterator(); - }, Encoders.bean(AuthorResult.class)); - + }, Encoders.bean(AuthorResult.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(workingPath + "/" + resultType + "/temp/authorresult"); + Dataset authorResult = Utils + .readPath(spark, workingPath + "/" + resultType + "/temp/authorresult", AuthorResult.class); // map the relation between author and result authorResult .map( (MapFunction) ar -> { CSVRelResAut ret = new CSVRelResAut(); - ret.setResult_id(ar.getResultId() ); - ret.setAuthor_id( ar.getAuthorId()); + ret.setResult_id(ar.getResultId()); + ret.setAuthor_id(ar.getAuthorId()); return ret; }, Encoders.bean(CSVRelResAut.class)) .write() .option("compression", "gzip") - .option("header","true") - .option("delimiter",Constants.SEP) +// .option("header", "true") +// .option("delimiter", Constants.SEP) .mode(SaveMode.Overwrite) - .csv(workingPath + "/" + resultType + "/result_author"); + .json(workingPath + "/" + resultType + "/result_author"); // ma the authors in the working dir. I do not want to have them repeated authorResult @@ -182,23 +199,28 @@ public class SparkDumpResults implements Serializable { Encoders.bean(CSVAuthor.class)) .write() .option("compression", "gzip") - .option("header","true") - .option("delimiter",Constants.SEP) +// .option("header", "true") +// .option("delimiter", Constants.SEP) .mode(SaveMode.Overwrite) - .csv(workingPath + "/" + resultType + "/author"); + .json(workingPath + "/" + resultType + "/author"); } private static List mapPid(List pid, String resultId) { - return pid.stream().map(p -> p.getQualifier().getClassid().toLowerCase() + "@" + p.getValue().toLowerCase()).distinct().map(p -> { - CSVPid ret = new CSVPid(); - ret.setId(DHPUtils.md5(p)); - ret.setResult_id(resultId); - ret.setPid(split(p, "@")[1]); - ret.setType(split(p, "@")[0]); + return pid + .stream() + .map(p -> p.getQualifier().getClassid().toLowerCase() + "@" + p.getValue().toLowerCase()) + .distinct() + .map(p -> { + CSVPid ret = new CSVPid(); + ret.setId(DHPUtils.md5(p)); + ret.setResult_id(resultId); + ret.setPid(split(p, "@")[1]); + ret.setType(split(p, "@")[0]); - return ret; - }).collect(Collectors.toList()); + return ret; + }) + .collect(Collectors.toList()); } @@ -213,6 +235,7 @@ public class SparkDumpResults implements Serializable { if (ar.getOrcid() != null) { ret.setOrcid(ar.getOrcid()); + ret.setFromOrcid(ar.getFromOrcid()); } else { ret.setOrcid(""); } @@ -220,27 +243,33 @@ public class SparkDumpResults implements Serializable { return ret; } - private static String getOrcid(List pid) { + private static Tuple2 getOrcid(List pid) { if (!Optional.ofNullable(pid).isPresent()) return null; if (pid.size() == 0) return null; for (StructuredProperty p : pid) { if (p.getQualifier().getClassid().equals(ModelConstants.ORCID)) { - return p.getValue(); + return new Tuple2<>(p.getValue(), Boolean.TRUE); + } + } + for (StructuredProperty p : pid) { + if (p.getQualifier().getClassid().equals(ModelConstants.ORCID_PENDING)) { + return new Tuple2<>(p.getValue(), Boolean.FALSE); } } return null; } - private static String getFieldValue(Field input){ + private static String getFieldValue(Field input) { if (input != null && - StringUtils.isNotEmpty(input.getValue())) { + StringUtils.isNotEmpty(input.getValue())) { return 
input.getValue(); } else { return ""; } } + private static CSVResult mapResultInfo(R r) { CSVResult ret = new CSVResult(); ret.setId(r.getId()); @@ -251,16 +280,24 @@ public class SparkDumpResults implements Serializable { ret.setPublication_date(getFieldValue(r.getDateofacceptance())); ret.setPublisher(getFieldValue(r.getPublisher())); - ret.setKeywords(String.join(", ", r.getSubject().stream().map(s -> { - if (StringUtils.isNotEmpty(s.getValue())) - return s.getValue().toLowerCase(); - else - return null;}).filter(Objects::nonNull).distinct().collect(Collectors.toList()))); + if (Optional.ofNullable(r.getSubject()).isPresent()) + ret.setKeywords(String.join(", ", r.getSubject().stream().map(s -> { + if (StringUtils.isNotEmpty(s.getValue())) + return s.getValue().toLowerCase(); + else + return null; + }).filter(Objects::nonNull).distinct().collect(Collectors.toList()))); + else + ret.setKeywords(""); + if (Optional.ofNullable(r.getCountry()).isPresent()) + ret + .setCountry( + String.join(", ", r.getCountry().stream().map(Country::getClassid).collect(Collectors.toList()))); + else + ret.setCountry(""); - ret.setCountry(String.join(", ", r.getCountry().stream().map(Country::getClassid).collect(Collectors.toList()))); - - if (StringUtils.isNotEmpty(r.getLanguage().getClassid())) { + if (Optional.ofNullable(r.getLanguage()).isPresent() && StringUtils.isNotEmpty(r.getLanguage().getClassid())) { ret.setLanguage(r.getLanguage().getClassid()); } else { ret.setLanguage(""); @@ -270,7 +307,7 @@ public class SparkDumpResults implements Serializable { } private static String getAbstract(List> description) { - if(description == null) + if (description == null) return ""; for (Field abs : description) { if (StringUtils.isNotEmpty(abs.getValue())) { diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkMoveOnSigleDir.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkMoveOnSigleDir.java index 48b3a22..5a41ae8 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkMoveOnSigleDir.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkMoveOnSigleDir.java @@ -1,109 +1,133 @@ + package eu.dnetlib.dhp.oa.graph.dump.csv; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.graph.dump.Utils; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.utils.DHPUtils; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.Optional; + import org.apache.commons.io.IOUtils; -import org.apache.commons.lang.StringUtils; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import scala.Tuple2; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; -import java.util.Optional; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import 
eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVPid; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRelResAut; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult; +import eu.dnetlib.dhp.schema.oaf.*; /** * @author miriam.baglioni * @Date 10/05/23 */ +//STEP 4 public class SparkMoveOnSigleDir implements Serializable { - //All the products saved in different directories are put under the same one. - // For the authors also a step of reconciliation mast be done, since the same author id can be saved in more that one directory + // All the products saved in different directories are put under the same one. + // For the authors also a step of reconciliation mast be done, since the same author id can be saved in more than + // one directory - private static final Logger log = LoggerFactory.getLogger(SparkMoveOnSigleDir.class); + private static final Logger log = LoggerFactory.getLogger(SparkMoveOnSigleDir.class); - public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - SparkMoveOnSigleDir.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_step2.json")); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkMoveOnSigleDir.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste4.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + SparkConf conf = new SparkConf(); - SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + // Utils.removeOutputDir(spark, outputPath); + run(spark, outputPath, workingPath); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - Utils.removeOutputDir(spark, outputPath ); - run(spark, outputPath, workingPath); + }); - }); + } - } + private static void run(SparkSession spark, String outputPath, + String workingPath) { - private static void run(SparkSession spark, String outputPath, - String workingPath) { + Utils + .readPath(spark, workingPath + "/publication/result", CSVResult.class) + .union(Utils.readPath(spark, workingPath + "/dataset/result", CSVResult.class)) + .union(Utils.readPath(spark, workingPath + "/software/result", CSVResult.class)) + .union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result", CSVResult.class)) + .write() + .mode(SaveMode.Overwrite) + .option("header", "true") + .option("delimiter", Constants.SEP) + .option("compression", "gzip") + 
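// The write that completes just below concatenates the per-type result tables (publication, dataset,
// software, otherresearchproduct) into a single gzip-compressed CSV with a header row and Constants.SEP
// as field delimiter; the same pattern is repeated afterwards for result_pid, result_author and author,
// with the authors additionally de-duplicated by id through the groupByKey / mapGroups step.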
.csv(outputPath + "/result"); - spark.read().textFile(workingPath + "/publication/result", workingPath + "/dataset/result", workingPath + "/software/result", workingPath + "/otherresearchproduct/result") - .write() - .mode(SaveMode.Overwrite) - .csv(outputPath + "/result"); + Utils + .readPath(spark, workingPath + "/publication/result_pid", CSVPid.class) + .union(Utils.readPath(spark, workingPath + "/dataset/result_pid", CSVPid.class)) + .union(Utils.readPath(spark, workingPath + "/software/result_pid", CSVPid.class)) + .union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result_pid", CSVPid.class)) + .write() + .mode(SaveMode.Overwrite) + .option("header", "true") + .option("delimiter", Constants.SEP) + .option("compression", "gzip") + .csv(outputPath + "/result_pid"); - spark.read().textFile(workingPath + "/publication/result_pid", workingPath + "/dataset/result_pid", workingPath + "/software/result_pid", workingPath + "/otherresearchproduct/result_pid") - .write() - .mode(SaveMode.Overwrite) - .csv(outputPath + "/result_pid"); + Utils + .readPath(spark, workingPath + "/publication/result_author", CSVRelResAut.class) + .union(Utils.readPath(spark, workingPath + "/dataset/result_author", CSVRelResAut.class)) + .union(Utils.readPath(spark, workingPath + "/software/result_author", CSVRelResAut.class)) + .union(Utils.readPath(spark, workingPath + "/otherresearchproduct/result_author", CSVRelResAut.class)) + .write() + .mode(SaveMode.Overwrite) + .option("header", "true") + .option("delimiter", Constants.SEP) + .option("compression", "gzip") + .csv(outputPath + "/result_author"); + Utils + .readPath(spark, workingPath + "/publication/author", CSVAuthor.class) + .union(Utils.readPath(spark, workingPath + "/dataset/author", CSVAuthor.class)) + .union(Utils.readPath(spark, workingPath + "/software/author", CSVAuthor.class)) + .union(Utils.readPath(spark, workingPath + "/otherresearchproduct/author", CSVAuthor.class)) + .groupByKey((MapFunction) r -> r.getId(), Encoders.STRING()) + .mapGroups( + (MapGroupsFunction) (k, it) -> it.next(), Encoders.bean(CSVAuthor.class)) + .write() + .mode(SaveMode.Overwrite) + .option("header", "true") + .option("delimiter", Constants.SEP) + .option("compression", "gzip") + .csv(outputPath + "/author"); - spark.read().textFile(workingPath + "/publication/result_author", workingPath + "/dataset/result_author", workingPath + "/software/result_author", workingPath + "/otherresearchproduct/result_author") - .write() - .mode(SaveMode.Overwrite) - .csv(outputPath + "/result_author"); + } - - spark.read().textFile(workingPath + "/publication/result_author", workingPath + "/dataset/result_author", workingPath + "/software/result_author", workingPath + "/otherresearchproduct/result_author") - .groupByKey((MapFunction) a -> a.split("\t")[0], Encoders.STRING()) - .mapGroups((MapGroupsFunction) (k, it) -> it.next(), Encoders.STRING() ) - .write() - .mode(SaveMode.Overwrite) - .csv(outputPath + "/author"); - - - } - - } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java index 094264e..522ce8b 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java @@ -4,10 +4,7 @@ package eu.dnetlib.dhp.oa.graph.dump.csv; import static 
eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Optional; +import java.util.*; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; @@ -40,6 +37,7 @@ public class SparkSelectResultsAndDumpRelations implements Serializable { private static final Logger log = LoggerFactory.getLogger(SparkSelectResultsAndDumpRelations.class); private static String RESULT_COMMUNITY_TABLE = "/result_community"; private static String COMMUNITY_RESULT_IDS = "/communityResultIds"; + public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( @@ -77,7 +75,7 @@ public class SparkSelectResultsAndDumpRelations implements Serializable { conf, isSparkSessionManaged, spark -> { - Utils.removeOutputDir(spark, outputPath); + Utils.removeOutputDir(spark, outputPath + RESULT_COMMUNITY_TABLE); run(spark, inputPath, outputPath, workingPath, finalCommunityList); }); @@ -99,7 +97,6 @@ public class SparkSelectResultsAndDumpRelations implements Serializable { spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList, workingPath + COMMUNITY_RESULT_IDS); - // write the relations result communities writeCommunityResultRelations( spark, inputPath + "/publication", Publication.class, communityList, outputPath + RESULT_COMMUNITY_TABLE); @@ -109,7 +106,7 @@ public class SparkSelectResultsAndDumpRelations implements Serializable { spark, inputPath + "/software", Software.class, communityList, outputPath + RESULT_COMMUNITY_TABLE); writeCommunityResultRelations( spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class, communityList, - outputPath + RESULT_COMMUNITY_TABLE); + outputPath + RESULT_COMMUNITY_TABLE); // select the relations with semantics cites org.apache.spark.sql.Dataset relations = Utils @@ -148,8 +145,8 @@ public class SparkSelectResultsAndDumpRelations implements Serializable { Encoders.bean(CSVCitation.class)) .write() .option("compression", "gzip") - .option("header","true") - .option("delimiter", Constants.SEP) + .option("header", "true") + .option("delimiter", Constants.SEP) .mode(SaveMode.Overwrite) .csv(outputPath + "/relation"); @@ -171,17 +168,24 @@ public class SparkSelectResultsAndDumpRelations implements Serializable { (FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && !p.getDataInfo().getInvisible()) .flatMap((FlatMapFunction) p -> { + Set inserted = new HashSet<>(); List ret = new ArrayList<>(); - for (String context :p.getContext().stream().map(Context::getId).distinct().collect(Collectors.toList())) { + for (String context : p + .getContext() + .stream() + .map(Context::getId) + .distinct() + .collect(Collectors.toList())) { String cId = context.contains("::") ? 
context.substring(0, context.indexOf("::")) : context; - if (communityList.contains(cId)) { + if (communityList.contains(cId) && !inserted.contains(cId)) { CSVRELCommunityResult crc = new CSVRELCommunityResult(); crc.setResult_id(p.getId()); crc.setCommunity_id(DHPUtils.md5(cId)); ret.add(crc); + inserted.add(cId); } } return ret.iterator(); @@ -189,8 +193,8 @@ public class SparkSelectResultsAndDumpRelations implements Serializable { .write() .option("compression", "gzip") .mode(SaveMode.Append) - .option("header","true") - .option("delimiter",Constants.SEP) + .option("header", "true") + .option("delimiter", Constants.SEP) .csv(outputPath); } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java index a3188f4..101ce33 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java @@ -13,6 +13,15 @@ public class CSVAuthor implements Serializable { private String lastname; private String fullname; private String orcid; + private Boolean fromOrcid; + + public Boolean getFromOrcid() { + return fromOrcid; + } + + public void setFromOrcid(Boolean fromOrcid) { + this.fromOrcid = fromOrcid; + } public String getId() { return id; diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml index 1606d6e..42fa50a 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + sourcePath @@ -65,22 +65,27 @@ - + - - - - - - + + + eu.dnetlib.dhp.oa.graph.dump.csv.DumpCommunities + --outputPath${outputPath}/community + --nameNode${nameNode} + --isLookUpUrl${isLookUpUrl} + --communities${communities} + + + + - + yarn cluster - select results from publication - eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults + select results ids connected to communities and dump relation + eu.dnetlib.dhp.oa.graph.dump.csv.SparkSelectResultsAndDumpRelations dump-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} @@ -93,15 +98,50 @@ --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} --sourcePath${sourcePath} - --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${workingDir} + --workingPath${outputPath}/workingDir + --outputPath${outputPath} --communities${communities} + + + + + + + + + + + + + + + + yarn + cluster + select results from publication + eu.dnetlib.dhp.oa.graph.dump.csv.SparkDumpResults + dump-${projectVersion}.jar + + --executor-memory=9G + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --conf spark.sql.shuffle.partitions=3840 + + --sourcePath${sourcePath} + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + + --workingPath${outputPath}/workingDir --resultTypepublication - + yarn cluster @@ -120,14 +160,14 @@ --sourcePath${sourcePath} --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${workingDir} - --communities${communities} + + 
--workingPath${outputPath}/workingDir --resultTypedataset - + yarn cluster @@ -146,14 +186,14 @@ --sourcePath${sourcePath} --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${workingDir} - --communities${communities} + + --workingPath${outputPath}/workingDir --resultTypeotherresearchproduct - + yarn cluster @@ -172,24 +212,25 @@ --sourcePath${sourcePath} --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${workingDir} - --communities${communities} + + --workingPath${outputPath}/workingDir --resultTypesoftware + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + yarn cluster - Dump table project - eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob + Dump single results + eu.dnetlib.dhp.oa.graph.dump.csv.SparkMoveOnSigleDir dump-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} @@ -201,38 +242,15 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --sourcePath${sourcePath}/project - --resultTableNameeu.dnetlib.dhp.schema.oaf.Project - --outputPath${workingDir}/project - --communityMapPathnoneed + --workingPath${outputPath}/workingDir + + --outputPath${outputPath} + - - - - - - yarn - cluster - Dump table project - eu.dnetlib.dhp.oa.graph.dump.projectssubset.ProjectsSubsetSparkJob - dump-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --sourcePath${workingDir}/project - --outputPath${workingDir}/tar/project - --projectListPath${projectListPath} - - + + eu.dnetlib.dhp.oa.graph.dump.MakeTar diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste1.json b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste1.json new file mode 100644 index 0000000..2f89c84 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste1.json @@ -0,0 +1,30 @@ +[ + + + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + { + "paramName": "nn", + "paramLongName": "nameNode", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": true + }, + + { + "paramName":"ilu", + "paramLongName":"isLookUpUrl", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + }, + { + "paramName":"c", + "paramLongName":"communities", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + } +] + diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste3.json b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste3.json index 87a01d8..1aceb18 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste3.json +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste3.json @@ -6,12 +6,7 @@ "paramDescription": "the path of the sequencial file to read", "paramRequired": true }, - { - "paramName": "out", - "paramLongName": "outputPath", - "paramDescription": "the 
path used to store temporary output files", - "paramRequired": true - }, + { "paramName": "ssm", "paramLongName": "isSparkSessionManaged", diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste4.json b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste4.json new file mode 100644 index 0000000..706e7e9 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste4.json @@ -0,0 +1,25 @@ +[ + + + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName":"wp", + "paramLongName":"workingPath", + "paramDescription": "the path of the working directory", + "paramRequired": true + }, + + { + "paramName":"o", + "paramLongName":"outputPath", + "paramDescription": "the path used to store the output files", + "paramRequired": true + } +] + + diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq index 7fad824..7b470ca 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/xqueries/set_of_communities.xq @@ -1,6 +1,6 @@ for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] -and ($x//CONFIGURATION/context[./@id='dh-ch'] or $x//CONFIGURATION/context[./@id='dariah'] or $x//CONFIGURATION/context[./@id='enermaps'] or $x//CONFIGURATION/context[./@id='beopen']) +and (%s) return {$x//CONFIGURATION/context/@id} diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java index dba7b94..9ef6ea1 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java @@ -1,15 +1,26 @@ package eu.dnetlib.dhp.oa.graph.dump.csv; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.utils.DHPUtils; +import static org.apache.commons.lang3.StringUtils.split; + +import java.io.IOException; +import java.io.StringReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Optional; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.Node; +import org.dom4j.io.SAXReader; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; @@ -17,253 +28,310 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.HashMap; -import java.util.Optional; +import com.fasterxml.jackson.databind.ObjectMapper; -import static org.apache.commons.lang3.StringUtils.split; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.utils.DHPUtils; /** * @author 
miriam.baglioni * @Date 11/05/23 */ public class DumpResultTest { - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - private static final Logger log = LoggerFactory - .getLogger(DumpResultTest.class); + private static final Logger log = LoggerFactory + .getLogger(DumpResultTest.class); - private static HashMap map = new HashMap<>(); + private static HashMap map = new HashMap<>(); - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files - .createTempDirectory(DumpResultTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files + .createTempDirectory(DumpResultTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(DumpResultTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(DumpResultTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = SparkSession - .builder() - .appName(DumpResultTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(DumpResultTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void testDumpResult() throws Exception { + @Test + public void testDumpResult() throws Exception { - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") - .getPath(); + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") + .getPath(); - spark.read().text(getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") - .getPath()) - .write() - .text(workingDir.toString() + "/working/resultIds/"); + spark + .read() + .text( + getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") + .getPath()) + .write() + .text(workingDir.toString() + "/working/resultIds/"); - SparkDumpResults.main(new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-outputPath", workingDir.toString() + "/output", - "-workingPath", workingDir.toString() + "/working", - "-resultType", "publication", - "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", - "-sourcePath", sourcePath - }); + SparkDumpResults.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-outputPath", workingDir.toString() + "/output", + "-workingPath", 
workingDir.toString() + "/working", + "-resultType", "publication", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-sourcePath", sourcePath + }); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + Dataset tmp = spark + .read() + .option("header", "true") + .option("delimiter", Constants.SEP) + .csv(workingDir.toString() + "/working/publication/result"); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + Assertions.assertEquals(3, tmp.count()); + Row row = tmp + .where("id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'") + .first(); + Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAs("accessright")); + Assertions.assertEquals("FI", row.getAs("country")); + Assertions.assertEquals("Lit.opg., bijl.", row.getAs("description")); + Assertions.assertEquals(3, split(row.getAs("keywords"), ", ").length); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); + Assertions.assertTrue(row.getAs("keywords").toString().contains("prospectie")); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); + Assertions.assertEquals("nl", row.getAs("language")); + Assertions.assertEquals("2007-01-01", row.getAs("publication_date")); + Assertions.assertEquals("FakePublisher1", row.getAs("publisher")); + Assertions + .assertEquals( + "Inventariserend veldonderzoek d.m.v. boringen (karterende fase) : Raadhuisstraat te Dirkshorn, gemeente Harenkarspel", + row.getAs("title")); + Assertions.assertEquals("publication", row.getAs("type")); - Dataset tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP) - .csv(workingDir.toString() + "/working/publication/result"); + row = tmp + .where("id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'") + .first(); + Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAs("accessright")); + Assertions.assertEquals(2, split(row.getAs("country"), ", ").length); + Assertions.assertNull(row.getAs("description")); + Assertions.assertEquals(2, split(row.getAs("keywords"), ", ").length); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); + Assertions.assertEquals("UNKNOWN", row.getAs("language")); + Assertions.assertNull(row.getAs("publication_date")); + Assertions.assertNull(row.getAs("publisher")); + Assertions.assertEquals("None", row.getAs("title")); + Assertions.assertEquals("publication", row.getAs("type")); - Assertions.assertEquals(3, tmp.count()); - Row row = tmp - .where("id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'") - .first(); - Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(),row.getAs("accessright")); - Assertions.assertEquals("FI" ,row.getAs("country")); - Assertions.assertEquals("Lit.opg., bijl." 
,row.getAs("description")); - Assertions.assertEquals(3 ,split(row.getAs("keywords"), ", ").length); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); - Assertions.assertTrue(row.getAs("keywords").toString().contains("prospectie")); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); - Assertions.assertEquals("nl", row.getAs("language")); - Assertions.assertEquals("2007-01-01", row.getAs("publication_date")); - Assertions.assertEquals("FakePublisher1", row.getAs("publisher")); - Assertions.assertEquals("Inventariserend veldonderzoek d.m.v. boringen (karterende fase) : Raadhuisstraat te Dirkshorn, gemeente Harenkarspel", row.getAs("title")); - Assertions.assertEquals("publication", row.getAs("type")); + row = tmp + .where("id = '50|DansKnawCris::26780065282e607306372abd0d808245'") + .first(); + Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAs("accessright")); + Assertions.assertNull(row.getAs("country")); + Assertions.assertNull(row.getAs("description")); + Assertions.assertEquals(2, split(row.getAs("keywords"), ", ").length); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); + Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); + Assertions.assertEquals("UNKNOWN", row.getAs("language")); + Assertions.assertNull(row.getAs("publication_date")); + Assertions.assertNull(row.getAs("publisher")); + Assertions.assertEquals("None", row.getAs("title")); + Assertions.assertEquals("publication", row.getAs("type")); - row = tmp - .where("id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'") - .first(); - Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(),row.getAs("accessright")); - Assertions.assertEquals(2 ,split(row.getAs("country"), ", ").length); - Assertions.assertNull(row.getAs("description")); - Assertions.assertEquals(2 ,split(row.getAs("keywords"), ", ").length); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); - Assertions.assertEquals("UNKNOWN", row.getAs("language")); - Assertions.assertNull( row.getAs("publication_date")); - Assertions.assertNull( row.getAs("publisher")); - Assertions.assertEquals("None", row.getAs("title")); - Assertions.assertEquals("publication", row.getAs("type")); + } - row = tmp - .where("id = '50|DansKnawCris::26780065282e607306372abd0d808245'") - .first(); - Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(),row.getAs("accessright")); - Assertions.assertNull(row.getAs("country")); - Assertions.assertNull(row.getAs("description")); - Assertions.assertEquals(2 ,split(row.getAs("keywords"), ", ").length); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); - Assertions.assertEquals("UNKNOWN", row.getAs("language")); - Assertions.assertNull( row.getAs("publication_date")); - Assertions.assertNull( row.getAs("publisher")); - Assertions.assertEquals("None", row.getAs("title")); - Assertions.assertEquals("publication", row.getAs("type")); + @Test + public void testDumpAuthor() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") + .getPath(); - } + spark + .read() + .text( + getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") + .getPath()) + 
.write() + .text(workingDir.toString() + "/working/resultIds/"); - @Test - public void testDumpAuthor() throws Exception { + SparkDumpResults.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-outputPath", workingDir.toString() + "/output", + "-workingPath", workingDir.toString() + "/working", + "-resultType", "publication", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-sourcePath", sourcePath + }); - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") - .getPath(); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - spark.read().text(getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") - .getPath()) - .write() - .text(workingDir.toString() + "/working/resultIds/"); + Dataset tmp = spark + .read() + .option("header", "true") + .option("delimiter", Constants.SEP) + .csv(workingDir.toString() + "/working/publication/author"); - SparkDumpResults.main(new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-outputPath", workingDir.toString() + "/output", - "-workingPath", workingDir.toString() + "/working", - "-resultType", "publication", - "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", - "-sourcePath", sourcePath - }); + Assertions.assertEquals(5, tmp.count()); + Assertions.assertEquals(1, tmp.where("firstName == 'Maryam'").count()); + Assertions + .assertEquals( + DHPUtils.md5("50|DansKnawCris::0224aae28af558f21768dbc6439c7a951"), + tmp.where("firstName == 'Maryam'").first().getAs("id")); + Assertions + .assertEquals(DHPUtils.md5("0000-0003-2914-2734"), tmp.where("firstName == 'Michael'").first().getAs("id")); + Assertions + .assertEquals( + DHPUtils.md5("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d92"), + tmp.where("firstName == 'Mikhail'").first().getAs("id")); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + } - Dataset tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP) - .csv(workingDir.toString() + "/working/publication/author"); + @Test + public void testDumpResultAuthorRelations() throws Exception { - Assertions.assertEquals(5, tmp.count()); + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") + .getPath(); - Assertions.assertEquals(1,tmp.where("firstName == 'Maryam'").count()); + spark + .read() + .text( + getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") + .getPath()) + .write() + .text(workingDir.toString() + "/working/resultIds/"); - Assertions.assertEquals(DHPUtils.md5("50|DansKnawCris::0224aae28af558f21768dbc6439c7a951"),tmp.where("firstName == 'Maryam'").first().getAs("id")); - Assertions.assertEquals(DHPUtils.md5("0000-0003-2914-2734"),tmp.where("firstName == 'Michael'").first().getAs("id")); - Assertions.assertEquals(DHPUtils.md5("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d92"),tmp.where("firstName == 'Mikhail'").first().getAs("id")); + SparkDumpResults.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-outputPath", workingDir.toString() + "/output", + "-workingPath", workingDir.toString() + "/working", + "-resultType", "publication", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-sourcePath", sourcePath + }); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - } + Dataset tmp = spark + .read() + .option("header", "true") + .option("delimiter", Constants.SEP) 
+ .csv(workingDir.toString() + "/working/publication/result_author"); - @Test - public void testDumpResultAuthorRelations() throws Exception { + Assertions.assertEquals(6, tmp.count()); - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") - .getPath(); + Assertions.assertEquals(2, tmp.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'").count()); + Assertions + .assertEquals( + 1, tmp + .where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'") + .where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'") + .count()); + Assertions + .assertEquals( + 1, tmp + .where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'") + .where("result_id == '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'") + .count()); - spark.read().text(getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") - .getPath()) - .write() - .text(workingDir.toString() + "/working/resultIds/"); + } - SparkDumpResults.main(new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-outputPath", workingDir.toString() + "/output", - "-workingPath", workingDir.toString() + "/working", - "-resultType", "publication", - "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", - "-sourcePath", sourcePath - }); + @Test + public void testDumpResultPid() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") + .getPath(); + spark + .read() + .text( + getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") + .getPath()) + .write() + .text(workingDir.toString() + "/working/resultIds/"); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + SparkDumpResults.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-outputPath", workingDir.toString() + "/output", + "-workingPath", workingDir.toString() + "/working", + "-resultType", "publication", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-sourcePath", sourcePath + }); - Dataset tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP) - .csv(workingDir.toString() + "/working/publication/result_author"); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - Assertions.assertEquals(6, tmp.count()); + Dataset tmp = spark + .read() + .option("header", "true") + .option("delimiter", Constants.SEP) + .csv(workingDir.toString() + "/working/publication/result_pid"); - Assertions.assertEquals(2, tmp.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'").count()); - Assertions.assertEquals(1, tmp.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'") - .where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'").count()); - Assertions.assertEquals(1, tmp.where("author_id == '" + DHPUtils.md5("0000-0003-2914-2734") + "'") - .where("result_id == '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'").count()); + tmp.show(false); + Assertions.assertEquals(4, tmp.count()); + Assertions + .assertEquals(2, tmp.where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'").count()); + Assertions + .assertEquals( + "10.1023/fakedoi", + tmp + .where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' and type == 'doi'") + .first() + .getAs("pid")); - } + } - @Test - public void testDumpResultPid() throws Exception { + @Test + public void prova() throws DocumentException { + String input = "" + 
+ " This community gathers research results, data, scientific publications and projects related to the domain of Digital Humanities. This broad definition includes Humanities, Cultural Heritage, History, Archaeology and related fields." + + + ""; - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/input/") - .getPath(); + final Document doc; + final SAXReader reader = new SAXReader(); - spark.read().text(getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") - .getPath()) - .write() - .text(workingDir.toString() + "/working/resultIds/"); - - SparkDumpResults.main(new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-outputPath", workingDir.toString() + "/output", - "-workingPath", workingDir.toString() + "/working", - "-resultType", "publication", - "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", - "-sourcePath", sourcePath - }); - - - - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - - Dataset tmp = spark.read().option("header", "true").option("delimiter", Constants.SEP) - .csv(workingDir.toString() + "/working/publication/result_pid"); - - tmp.show(false); - Assertions.assertEquals(4, tmp.count()); - - Assertions.assertEquals(2, tmp.where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'").count()); - Assertions.assertEquals("10.1023/fakedoi", tmp.where("result_id == '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' and type == 'doi'").first().getAs("pid")); - - - } + doc = reader.read(new StringReader(input)); + Element root = doc.getRootElement(); + StringBuilder builder = new StringBuilder(); + builder.append(DHPUtils.md5(root.attribute("id").getValue())); + builder.append(Constants.SEP); + builder.append(root.attribute("label").getValue()); + builder.append(Constants.SEP); + builder.append(root.attribute("id").getValue()); + builder.append(Constants.SEP); + builder.append(((Node) (root.selectNodes("//description").get(0))).getText()); + System.out.println(builder.toString()); + } } diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/MoveOnSingleDirTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/MoveOnSingleDirTest.java new file mode 100644 index 0000000..dd8efca --- /dev/null +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/MoveOnSingleDirTest.java @@ -0,0 +1,117 @@ + +package eu.dnetlib.dhp.oa.graph.dump.csv; + +import static org.apache.commons.lang3.StringUtils.split; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashMap; + +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.common.ModelConstants; + +/** + * @author miriam.baglioni + * @Date 25/05/23 + */ +public class MoveOnSingleDirTest { + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + private static final Logger log = LoggerFactory + .getLogger(MoveOnSingleDirTest.class); + + private static 
HashMap map = new HashMap<>(); + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files + .createTempDirectory(MoveOnSingleDirTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(MoveOnSingleDirTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(MoveOnSingleDirTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + public void testDMoveSingleDir() throws Exception { + + final String workingPath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working") + .getPath(); + + spark + .read() + .text( + getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds") + .getPath()) + .write() + .text(workingDir.toString() + "/working/resultIds/"); + + SparkMoveOnSigleDir.main(new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-outputPath", workingDir.toString() + "/output", + "-workingPath", workingPath + }); + + Dataset tmp = spark + .read() + .option("header", "true") + .option("delimiter", Constants.SEP) + .csv(workingDir.toString() + "/output/result"); + + Assertions.assertEquals(21, tmp.count()); + Assertions.assertEquals(12, tmp.filter("type == 'dataset'").count()); + Assertions.assertEquals(4, tmp.filter("type == 'other'").count()); + Assertions.assertEquals(4, tmp.filter("type == 'publication'").count()); + Assertions.assertEquals(1, tmp.filter("type == 'software'").count()); + + Assertions + .assertEquals( + 8, spark + .read() + .option("header", "true") + .option("delimiter", Constants.SEP) + .csv(workingDir.toString() + "/output/author") + .count()); + + } +} diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java index 767267c..a4eed68 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/SelectResultAndDumpRelationTest.java @@ -6,8 +6,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.HashMap; -import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRELCommunityResult; -import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -26,6 +24,8 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVRELCommunityResult; +import eu.dnetlib.dhp.utils.DHPUtils; /** * @author miriam.baglioni @@ -88,70 +88,134 @@ public class SelectResultAndDumpRelationTest { "-sourcePath", sourcePath }); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + Assertions.assertEquals(2, sc.textFile(workingDir.toString() + "/working/communityResultIds").count()); + Assertions + .assertEquals( + 1, sc + 
.textFile(workingDir.toString() + "/working/communityResultIds") + .filter(v -> v.equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")) + .count()); - Assertions.assertEquals(2,sc.textFile(workingDir.toString() + "/working/communityResultIds").count()); + Assertions + .assertEquals( + 1, sc + .textFile(workingDir.toString() + "/working/communityResultIds") + .filter(v -> v.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")) + .count()); - Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/communityResultIds") - .filter(v -> v.equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")).count()); - - Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/communityResultIds") - .filter(v -> v.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).count()); - - - //verify that the association is correct with the communityid and result id - spark.read().option("header", "true").option("delimiter",Constants.SEP).csv(workingDir.toString() + "/output/result_community") - .createOrReplaceTempView("result_community"); + // verify that the association is correct with the communityid and result id + spark + .read() + .option("header", "true") + .option("delimiter", Constants.SEP) + .csv(workingDir.toString() + "/output/result_community") + .createOrReplaceTempView("result_community"); Assertions.assertEquals(3, spark.sql("SELECT * FROM result_community").count()); - Assertions.assertEquals(1, spark.sql("SELECT * " + - "FROM result_community " + - "WHERE community_id = '" + DHPUtils.md5("dh-ch") + "'").count()); + Assertions + .assertEquals( + 1, spark + .sql( + "SELECT * " + + "FROM result_community " + + "WHERE community_id = '" + DHPUtils.md5("dh-ch") + "'") + .count()); - Assertions.assertEquals(1, spark.sql("SELECT * " + - "FROM result_community" + - " WHERE result_id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' " + - "AND community_id = '" + DHPUtils.md5("dh-ch") + "'").count()); - - Assertions.assertEquals(2, spark.sql("SELECT * " + - "FROM result_community " + - "WHERE community_id = '" + DHPUtils.md5("enermaps") + "'").count()); - Assertions.assertEquals(1, spark.sql("SELECT * " + - "FROM result_community " + - "WHERE result_id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9' " + - "AND community_id = '" + DHPUtils.md5("enermaps") + "'").count()); - Assertions.assertEquals(1, spark.sql("SELECT * " + - "FROM result_community " + - "WHERE result_id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' " + - "AND community_id = '" + DHPUtils.md5("enermaps") + "'").count()); + Assertions + .assertEquals( + 1, spark + .sql( + "SELECT * " + + "FROM result_community" + + " WHERE result_id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' " + + "AND community_id = '" + DHPUtils.md5("dh-ch") + "'") + .count()); + Assertions + .assertEquals( + 2, spark + .sql( + "SELECT * " + + "FROM result_community " + + "WHERE community_id = '" + DHPUtils.md5("enermaps") + "'") + .count()); + Assertions + .assertEquals( + 1, spark + .sql( + "SELECT * " + + "FROM result_community " + + "WHERE result_id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9' " + + "AND community_id = '" + DHPUtils.md5("enermaps") + "'") + .count()); + Assertions + .assertEquals( + 1, spark + .sql( + "SELECT * " + + "FROM result_community " + + "WHERE result_id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95' " + + "AND community_id = '" + DHPUtils.md5("enermaps") + "'") + .count()); Assertions.assertEquals(3, spark.read().textFile(workingDir.toString() 
+ "/working/resultIds").count()); - Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/resultIds") - .filter(v -> v.equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")).count()); + Assertions + .assertEquals( + 1, sc + .textFile(workingDir.toString() + "/working/resultIds") + .filter(v -> v.equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")) + .count()); - Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/resultIds") - .filter(v -> v.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).count()); + Assertions + .assertEquals( + 1, sc + .textFile(workingDir.toString() + "/working/resultIds") + .filter(v -> v.equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")) + .count()); - Assertions.assertEquals(1, sc.textFile(workingDir.toString() + "/working/resultIds") - .filter(v -> v.equals("50|DansKnawCris::26780065282e607306372abd0d808245")).count()); + Assertions + .assertEquals( + 1, sc + .textFile(workingDir.toString() + "/working/resultIds") + .filter(v -> v.equals("50|DansKnawCris::26780065282e607306372abd0d808245")) + .count()); - spark.read().option("header", "true").option("delimiter",Constants.SEP).csv(workingDir.toString() + "/output/relation") - .createOrReplaceTempView("relation"); + spark + .read() + .option("header", "true") + .option("delimiter", Constants.SEP) + .csv(workingDir.toString() + "/output/relation") + .createOrReplaceTempView("relation"); Assertions.assertEquals(2, spark.sql("SELECT * FROM relation").count()); - Assertions.assertEquals(1, spark.sql("SELECT * FROM relation WHERE id = '" + - DHPUtils.md5(("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9cites50|DansKnawCris::26780065282e607306372abd0d808245")) + "'").count()); + Assertions + .assertEquals( + 1, spark + .sql( + "SELECT * FROM relation WHERE id = '" + + DHPUtils + .md5( + ("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9cites50|DansKnawCris::26780065282e607306372abd0d808245")) + + "'") + .count()); - Assertions.assertEquals(1, spark.sql("SELECT * FROM relation WHERE id = '" + - DHPUtils.md5(("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9cites50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")) + "'").count()); + Assertions + .assertEquals( + 1, spark + .sql( + "SELECT * FROM relation WHERE id = '" + + DHPUtils + .md5( + ("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9cites50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")) + + "'") + .count()); - - } + } } diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/author/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/author/part0 new file mode 100644 index 0000000..e66ef9c --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/author/part0 @@ -0,0 +1,4 @@ +{"fullname":"Giovanni Aloisio","id":"5ac035663df4d9099cf92d0e3f22a964","orcid":""} +{"fullname":"Cosimo Palazzo","id":"9f0d3123b6390dd7b2f3cee66c6bc926","orcid":""} +{"firstname":"L","fullname":"L, Issel-Tarver","id":"bafb7637b5f1c692419e55b13bf719a3","lastname":"Issel-Tarver","orcid":""} +{"firstname":"Voula","fullname":"Giouli, Voula","id":"c80f55a9afb32ffc4bc6bb67b6e0df33","lastname":"Giouli","orcid":""} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result/part0 new file mode 100644 index 0000000..077a321 --- /dev/null +++ 
b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result/part0 @@ -0,0 +1,12 @@ +{"accessright":"UNKNOWN","country":"","description":"Absidiole NE_face ext","id":"50|doi_dedup___::f126b46ff3cea748ffbda3ae4e9ce816","keywords":"chevet, fenêtre, façade","language":"und","publication_date":"2019-01-01","publisher":"Nakala by Huma-Num","title":"QS83_17_Absidiole NE_face ext.jpg","type":"dataset"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::0676bf8b1f33afc121ac4f28e1c3d8ad","keywords":"kiu38; http://sith.huma-num.fr/karnak/38","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 69534. Karnak, KIU 38 / stèle d’enceinte de ramsès iii XXe dynastie / Ramses III","type":"dataset"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::0b92f33d78d42f54084145b91500941a","keywords":"kiu2869; http://sith.huma-num.fr/karnak/2869","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 8263. Karnak, KIU 2869 / Cour à portique de Thoutmosis IV, Scene, piliers, pilier 03 est : accolade XVIIIe dynastie / Thoutmosis IV","type":"dataset"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::157349520d61226da5d85e0856bdae3e","keywords":"kiu4635; http://sith.huma-num.fr/karnak/4635","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 171030. Karnak, KIU 4635 / Cour nord du IVe pylône porte sud-est, face nord, montants est XVIIIe dynastie / Thoutmosis III","type":"dataset"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::18b2aa2b1b9a2a11da84bc8e1f662070","keywords":"kiu4225; http://sith.huma-num.fr/karnak/4225, kiu4217; http://sith.huma-num.fr/karnak/4217, kiu4218; http://sith.huma-num.fr/karnak/4218","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 151603. Karnak, KIU 4217 / Temple d’Opet, Soubassement, face extérieure est, soubassement, 1er registre sud 10.n (opet 213 gauche) Romains / Auguste","type":"dataset"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::31f713b5670d801de154453ea68ff4e1","keywords":"kiu3479; http://sith.huma-num.fr/karnak/3479","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 198480. Karnak, KIU 3479 / VIe pylône, Scene, mur intérieur est, partie nord 3.s annales (vi) : XVIIIe dynastie / Thoutmosis III","type":"dataset"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::358b383fd0975b292edafd6b1d1fe9a2","keywords":"op179; http://sith.huma-num.fr/karnak/op179, kiu1114; http://sith.huma-num.fr/karnak/1114","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 135670. Karnak, KIU 1114 / Temple de Ptah, Objet, objet(s) découvert(s) porte de grenier XVIIe dynastie / SenakhtenRe","type":"dataset"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::4cc834e3443c27cb7b0100a470c5c7f9","keywords":"kiu7329; http://sith.huma-num.fr/karnak/7329, kiu7330; http://sith.huma-num.fr/karnak/7330, kiu7331; http://sith.huma-num.fr/karnak/7331","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 169666. 
Karnak, KIU 7330 / Salle hypostyle colonnes, côté sud, colonne 017, fût frise XXe dynastie / Ramses IV","type":"dataset"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::516950eba1c6737cbe26a52401b3fb2c","keywords":"kiu2185; http://sith.huma-num.fr/karnak/2185","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 128938. Karnak, KIU 2185 / « Magasin pur » de Khonsou, Objet porte fragmentaire du « magasin pur » de khonsou Ptolemees / Ptolemee Evergete Ier","type":"dataset"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::71182950600db8e6aff20566f9df0345","keywords":"kiu4212; http://sith.huma-num.fr/karnak/4212","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 151470. Karnak, KIU 4212 / Temple d’Opet, Scene, face extérieure est, soubassement, 1er registre sud 04.n (opet 210 gauche) : procession de nils Romains / Auguste","type":"dataset"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::99592f6a6bc8b9b67b0d8f1612e310bb","keywords":"kiu3939; http://sith.huma-num.fr/karnak/3939, kiu3822; http://sith.huma-num.fr/karnak/3822, kiu3823; http://sith.huma-num.fr/karnak/3823, kiu3825; http://sith.huma-num.fr/karnak/3825","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 141190. Karnak, KIU 3939 / Temple d’Opet face extérieure sud, soubassement, 1er registre bandeau (opet 266-267) Romains / Auguste","type":"dataset"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|r3f5b9831893::abd19ac4153416d0eb73b8f2e7612d35","keywords":"kiu5592; http://sith.huma-num.fr/karnak/5592, kiu8128; http://sith.huma-num.fr/karnak/8128, kiu8129; http://sith.huma-num.fr/karnak/8129, kiu8130; http://sith.huma-num.fr/karnak/8130","language":"und","publication_date":"","publisher":"Nakala by Huma-Num","title":"CNRS-CFEETK 167789. 
Karnak","type":"dataset"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result_author/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result_author/part0 new file mode 100644 index 0000000..12baaf5 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result_author/part0 @@ -0,0 +1,19 @@ +{"author_id":"54ecb1d939e05ac0542d6af377100e67","result_id":"50|doi_dedup___::f126b46ff3cea748ffbda3ae4e9ce816"} +{"author_id":"06706770e1fb3b89fea4d0a8a60e7809","result_id":"50|r3f5b9831893::0b92f33d78d42f54084145b91500941a"} +{"author_id":"3afe02a6563ca7c30df007d69645f730","result_id":"50|r3f5b9831893::18b2aa2b1b9a2a11da84bc8e1f662070"} +{"author_id":"440464bc227f8371c905779a4641d49a","result_id":"50|r3f5b9831893::31f713b5670d801de154453ea68ff4e1"} +{"author_id":"3d0c4aa051cdc1cc71907a973f616767","result_id":"50|r3f5b9831893::4cc834e3443c27cb7b0100a470c5c7f9"} +{"author_id":"874398e3c71ba2e8cf76de4ba458d5fb","result_id":"50|r3f5b9831893::516950eba1c6737cbe26a52401b3fb2c"} +{"author_id":"fe165c3a039f1cc4301c9dbd7c7f2247","result_id":"50|r3f5b9831893::71182950600db8e6aff20566f9df0345"} +{"author_id":"b3b2b99a02b1bbd8d4b5a1994b8d60fe","result_id":"50|r3f5b9831893::99592f6a6bc8b9b67b0d8f1612e310bb"} +{"author_id":"be12aee5482275608067a3cab9e8beb6","result_id":"50|r3f5b9831893::abd19ac4153416d0eb73b8f2e7612d35"} +{"author_id":"dde164aefcd3aebafec84feedd999170","result_id":"50|r3f5b9831893::b26848c3000fbd7153e2fdeaf3d70bd2"} +{"author_id":"3a55a188e8a23e645752055ff18d4720","result_id":"50|r3f5b9831893::b94d49cfb4ea230b784be1fe24f0edd5"} +{"author_id":"a0bcddc2a41a4cc0dd768eced4dd0939","result_id":"50|r3f5b9831893::ef9f1724cef04a9f62bdf90d9084d70b"} +{"author_id":"51b2a67f20cdfd9628233ebf04158468","result_id":"50|r3f5b9831893::f349ea5bdd91d846e70e6a4a3c71ccd6"} +{"author_id":"dfad2f4b741f4fbac8f504dd0088db06","result_id":"50|r3f5b9831893::f82af1f6dfd2b8644ba3ab799285849f"} +{"author_id":"b52f90003de8e73f2f704ced12b83bba","result_id":"50|r3f5b9831893::fb7cf14ef55474c3b745262fea21d4c0"} +{"author_id":"08e7328f7c44b32e1203374aadbedf0c","result_id":"50|doi_dedup___::c7a29e095e1763e09af2eb0e2ffbb717"} +{"author_id":"c8c6c6273e798cf408f848afd8ca13f8","result_id":"50|r3f5b9831893::0bc48082a3803d837098447a4f8fb28d"} +{"author_id":"16d0306f0af215d9ec8f70660026d585","result_id":"50|r3f5b9831893::1a372b7640db956b13716fc5e7b455b7"} +{"author_id":"c0a97e8f55967dedb4a57125e3174816","result_id":"50|r3f5b9831893::1b8dec9230423314146858112059845d"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result_pid/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result_pid/part0 new file mode 100644 index 0000000..9a4ac23 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/dataset/result_pid/part0 @@ -0,0 +1,33 @@ +{"id":"58c75fe64b4df0126e0e4fdfafb8be18","pid":"http://hdl.handle.net/11280/86e6ac0d","result_id":"50|doi_dedup___::f126b46ff3cea748ffbda3ae4e9ce816","type":"handle"} +{"id":"45c62956554c7d3e7f9708bce5c9a086","pid":"11280/86e6ac0d","result_id":"50|doi_dedup___::f126b46ff3cea748ffbda3ae4e9ce816","type":"handle"} +{"id":"312a5c89fa6d82ccc66c1b9615d3d364","pid":"10.34847/nkl.7f846pnw","result_id":"50|doi_dedup___::f126b46ff3cea748ffbda3ae4e9ce816","type":"doi"} 
+{"id":"cb29ee70d77746445ca5ce5f121bc473","pid":"http://hdl.handle.net/11280/747fab4a","result_id":"50|r3f5b9831893::0676bf8b1f33afc121ac4f28e1c3d8ad","type":"handle"} +{"id":"45a465d38aabff009c0fcf41c2f08c67","pid":"11280/747fab4a","result_id":"50|r3f5b9831893::0676bf8b1f33afc121ac4f28e1c3d8ad","type":"handle"} +{"id":"cc956040bd5031ecec943d91e8b764fb","pid":"11280/51909d00","result_id":"50|r3f5b9831893::0b92f33d78d42f54084145b91500941a","type":"handle"} +{"id":"726c5eef33521e505ef9cb48fe75d596","pid":"http://hdl.handle.net/11280/51909d00","result_id":"50|r3f5b9831893::0b92f33d78d42f54084145b91500941a","type":"handle"} +{"id":"32429dfa16fa2847b0286efaf0a0dce8","pid":"11280/fc581aa4","result_id":"50|r3f5b9831893::157349520d61226da5d85e0856bdae3e","type":"handle"} +{"id":"554994db0c44fe13283444e190ac9607","pid":"http://hdl.handle.net/11280/fc581aa4","result_id":"50|r3f5b9831893::157349520d61226da5d85e0856bdae3e","type":"handle"} +{"id":"88a301e2cadf5e691ebb6a5665eb78f4","pid":"http://hdl.handle.net/11280/1cfc2896","result_id":"50|r3f5b9831893::18b2aa2b1b9a2a11da84bc8e1f662070","type":"handle"} +{"id":"2f15200f24a870ff9edb3913e292d61f","pid":"11280/1cfc2896","result_id":"50|r3f5b9831893::18b2aa2b1b9a2a11da84bc8e1f662070","type":"handle"} +{"id":"027c0e2083ab8ea468469a34fe9d46e1","pid":"http://hdl.handle.net/11280/3b2225c5","result_id":"50|r3f5b9831893::31f713b5670d801de154453ea68ff4e1","type":"handle"} +{"id":"8466cbb68b2d1c541b056006b7f27ea4","pid":"11280/3b2225c5","result_id":"50|r3f5b9831893::31f713b5670d801de154453ea68ff4e1","type":"handle"} +{"id":"bac82482f2dba75f8e34802ed7789554","pid":"http://hdl.handle.net/11280/f3911908","result_id":"50|r3f5b9831893::358b383fd0975b292edafd6b1d1fe9a2","type":"handle"} +{"id":"8cd4bb9ef9c8007155a95ee9df90ea69","pid":"11280/f3911908","result_id":"50|r3f5b9831893::358b383fd0975b292edafd6b1d1fe9a2","type":"handle"} +{"id":"ba83be852322c4c86ed6b3ab0610987d","pid":"11280/65056b94","result_id":"50|r3f5b9831893::4cc834e3443c27cb7b0100a470c5c7f9","type":"handle"} +{"id":"93cd2ffff769223cf04034e0db0f6284","pid":"http://hdl.handle.net/11280/65056b94","result_id":"50|r3f5b9831893::4cc834e3443c27cb7b0100a470c5c7f9","type":"handle"} +{"id":"c5dcb6dab6f53a281f96bfbe048858ce","pid":"http://hdl.handle.net/11280/dac5fe22","result_id":"50|r3f5b9831893::516950eba1c6737cbe26a52401b3fb2c","type":"handle"} +{"id":"999076fd410cdb0c1599b7d5e355b94a","pid":"11280/dac5fe22","result_id":"50|r3f5b9831893::516950eba1c6737cbe26a52401b3fb2c","type":"handle"} +{"id":"ef68e036a7e753da17a2794ccf1b8ce5","pid":"http://hdl.handle.net/11280/446e3387","result_id":"50|r3f5b9831893::71182950600db8e6aff20566f9df0345","type":"handle"} +{"id":"5377b0f0143c324176bbee897d9d966c","pid":"11280/446e3387","result_id":"50|r3f5b9831893::71182950600db8e6aff20566f9df0345","type":"handle"} +{"id":"9e588201f52f05fca56efc43583ca615","pid":"http://hdl.handle.net/11280/969ae30a","result_id":"50|r3f5b9831893::99592f6a6bc8b9b67b0d8f1612e310bb","type":"handle"} +{"id":"f64681856cadef587b4c34396e9e6861","pid":"11280/969ae30a","result_id":"50|r3f5b9831893::99592f6a6bc8b9b67b0d8f1612e310bb","type":"handle"} +{"id":"4ad4d6c56ce6e206c42849df92d894f5","pid":"http://hdl.handle.net/11280/dddf5851","result_id":"50|r3f5b9831893::abd19ac4153416d0eb73b8f2e7612d35","type":"handle"} +{"id":"0b3ea2f9c96eb9593fd9b21363b7d9f6","pid":"11280/dddf5851","result_id":"50|r3f5b9831893::abd19ac4153416d0eb73b8f2e7612d35","type":"handle"} 
+{"id":"45dc28539b305d186f51d5ee9465aee0","pid":"http://hdl.handle.net/11280/3f2679d9","result_id":"50|r3f5b9831893::b26848c3000fbd7153e2fdeaf3d70bd2","type":"handle"} +{"id":"b9c5beb054f3ca72477cb1b07351196a","pid":"11280/3f2679d9","result_id":"50|r3f5b9831893::b26848c3000fbd7153e2fdeaf3d70bd2","type":"handle"} +{"id":"ee0120c72b2f9c1fc1dd3cf47c98ac9d","pid":"http://hdl.handle.net/11280/d957e9f3","result_id":"50|r3f5b9831893::b94d49cfb4ea230b784be1fe24f0edd5","type":"handle"} +{"id":"4770ff66784a0b9470551d46e7a0aaa0","pid":"11280/d957e9f3","result_id":"50|r3f5b9831893::b94d49cfb4ea230b784be1fe24f0edd5","type":"handle"} +{"id":"3cf2316ff497fda37d07757e72307173","pid":"11280/e8d8ed9f","result_id":"50|r3f5b9831893::ef9f1724cef04a9f62bdf90d9084d70b","type":"handle"} +{"id":"5a9092d335d45be6d01f9d6af99c9d86","pid":"http://hdl.handle.net/11280/e8d8ed9f","result_id":"50|r3f5b9831893::ef9f1724cef04a9f62bdf90d9084d70b","type":"handle"} +{"id":"37018c7be9823e3c49aeff0e9ae69054","pid":"http://hdl.handle.net/11280/9ff65944","result_id":"50|r3f5b9831893::f349ea5bdd91d846e70e6a4a3c71ccd6","type":"handle"} +{"id":"c372305e06eacc7855c7de0e3fc6df07","pid":"11280/9ff65944","result_id":"50|r3f5b9831893::f349ea5bdd91d846e70e6a4a3c71ccd6","type":"handle"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/author/part1 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/author/part1 new file mode 100644 index 0000000..28a7797 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/author/part1 @@ -0,0 +1,2 @@ +{"firstname":"Taal En Spraaktechnologie","fullname":"LS OZ Taal en spraaktechnologie","id":"60fa4ab9fa107f5281b91c1db2885bf9","lastname":"Ls Oz","orcid":""} +{"fullname":"Nispen, van, Annelies","id":"1279ef1ced7366cc6af25a2079ab4554","orcid":""} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result/part0 new file mode 100644 index 0000000..963e4ab --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result/part0 @@ -0,0 +1,4 @@ +{"accessright":"OPEN","country":"","description":"","id":"50|core_ac_uk__::15d72bdde1addf525170aa61664f8daf","keywords":"","language":"eng","publication_date":"","publisher":"Springer International Publishing","title":"Reengineering and Reinventing both Democracy and the Concept of Life in the Digital Era","type":"other"} +{"accessright":"OPEN","country":"IT","description":"","id":"50|od______3686::b0cb086c9a0222684d48b3e355eba1c8","keywords":"","language":"und","publication_date":"2002-01-01","publisher":"","title":"Progetto dell’impianto eolico di Pescopagano (Potenza), progetto secondo classificato al Concorso nazionale “Paesaggi del Vento”, progetto pubblicato in: E. Zanchini , a cura di, Paesaggi del vento, Meltemi, Roma 2002 , pp.84-89","type":"other"} +{"accessright":"OPEN","country":"NL","description":"This article reports about the on-going work on a new version of the metadata framework Component Metadata Infrastructure (CMDI), central to the CLARIN infrastructure. 
Version 1.2 introduces a number of important changes based on the experience gathered in the last five years of intensive use of CMDI by the digital humanities community, addressing problems encountered, but also introducing new functionality. Next to the consolidation of the structure of the model and schema sanity, new means for lifecycle management have been introduced aimed at combatting the observed proliferation of components, new mechanism for use of external vocabularies will contribute to more consistent use of controlled values and cues for tools will allow improved presentation of the metadata records to the human users. The feature set has been frozen and approved, and the infrastructure is now entering a transition phase, in which all the tools and data need to be migrated to the new version.","id":"50|narcis______::07cab979c27c9240f7ef5d80d752679b","keywords":"","language":"eng","publication_date":"2015-08-26","publisher":"Linköping University Electronic Press, Linköpings universitet","title":"CMDI 1.2: Improvements in the CLARIN Component Metadata Infrastructure","type":"other"} +{"accessright":"OPEN","country":"NL","description":"This paper describes what the CLARIN infrastructure is and how it can be used, with a focus on the Netherlands part of the CLARIN infrastructure. It aims to explain how a humanities researcher can use the CLARIN infrastructure.","id":"50|narcis______::655f9ef445ffa66a1782f29208cc1569","keywords":"","language":"eng","publication_date":"2014-08-20","publisher":"UiL OTS","title":"The CLARIN infrastructure in the Netherlands: What is it and how can you use it?","type":"other"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result_author/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result_author/part0 new file mode 100644 index 0000000..ac79f71 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result_author/part0 @@ -0,0 +1,17 @@ +{"author_id":"af07dd90a1f0be8159e52f7f572d1c5c","result_id":"50|narcis______::14afd8c5c46d17af87ceef410ab25e01"} +{"author_id":"9f24c2ed6e1cb057772b641806ae77ec","result_id":"50|narcis______::14afd8c5c46d17af87ceef410ab25e01"} +{"author_id":"9ad1701184de323823fc1a858a868ac2","result_id":"50|narcis______::14afd8c5c46d17af87ceef410ab25e01"} +{"author_id":"de106449e38166d8cf2ac7bb7bb6c5d8","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"} +{"author_id":"8a157b06eaaf9fbca8b67011bc374744","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"} +{"author_id":"10bffdada7578cec278ba1a5e3d63da5","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"} +{"author_id":"d2a8ebfa553c4f6ff90998bd1c58fbcc","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"} +{"author_id":"86b929edfab2d532f075506559a6ac76","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"} +{"author_id":"478c134423c1afa8bb2ee174014726af","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"} +{"author_id":"ba92d49768133c928d102eb86cb3690c","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"} +{"author_id":"d590f7127b93a0b6003cbed3bd20983b","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"} +{"author_id":"c146c73851641e52e6ea1adc6f271fd1","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"} 
+{"author_id":"e3e6238baf917a025bcbff8be9288393","result_id":"50|r3730f562f9e::36d61b2d7feb632e94e4f8113b890c6d"} +{"author_id":"e1a361a13f6595628524b87b6fa29918","result_id":"50|dedup_wf_001::e9457bd83cfd425b8779f239c96e0ffe"} +{"author_id":"5764f46e7ded9260eadea13e81fdf0fe","result_id":"50|dedup_wf_001::e9457bd83cfd425b8779f239c96e0ffe"} +{"author_id":"b56a640d36a2dc9e3dc88401edb61149","result_id":"50|dedup_wf_001::e9457bd83cfd425b8779f239c96e0ffe"} +{"author_id":"e08632d458b519b66e575dd5b7eb54e9","result_id":"50|dedup_wf_001::e9457bd83cfd425b8779f239c96e0ffe"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result_pid/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result_pid/part0 new file mode 100644 index 0000000..cd92ae4 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/otherresearchproduct/result_pid/part0 @@ -0,0 +1,5 @@ +{"id":"3ff0ab5e679c5320381c857d8699cd4a","pid":"10.5281/zenodo.2657248","result_id":"50|doi_dedup___::84db353272d83833fa76ec87fc540e63","type":"doi"} +{"id":"935716d050a36d36f797e843187b8192","pid":"https://hdl.handle.net/21.11115/0000-000e-0ff1-2","result_id":"50|r369162d0a40::da892118ba0be7a5cf695ad54ae5147e","type":"handle"} +{"id":"133b9dd1a59099adc577004209e83c52","pid":"21.11115/0000-000e-0ff1-2","result_id":"50|r369162d0a40::da892118ba0be7a5cf695ad54ae5147e","type":"handle"} +{"id":"8e17b86e61db6c34ec741eabe947ea9f","pid":"https://hdl.handle.net/21.11115/0000-000e-ce31-3","result_id":"50|r369162d0a40::b69a5145a8e41bdaa33c24be67c209f1","type":"handle"} +{"id":"b7cc730f4cbb6d379d5c4f57369978b3","pid":"21.11115/0000-000e-ce31-3","result_id":"50|r369162d0a40::b69a5145a8e41bdaa33c24be67c209f1","type":"handle"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/author/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/author/part0 new file mode 100644 index 0000000..e66ef9c --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/author/part0 @@ -0,0 +1,4 @@ +{"fullname":"Giovanni Aloisio","id":"5ac035663df4d9099cf92d0e3f22a964","orcid":""} +{"fullname":"Cosimo Palazzo","id":"9f0d3123b6390dd7b2f3cee66c6bc926","orcid":""} +{"firstname":"L","fullname":"L, Issel-Tarver","id":"bafb7637b5f1c692419e55b13bf719a3","lastname":"Issel-Tarver","orcid":""} +{"firstname":"Voula","fullname":"Giouli, Voula","id":"c80f55a9afb32ffc4bc6bb67b6e0df33","lastname":"Giouli","orcid":""} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result/part0 new file mode 100644 index 0000000..c264de7 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result/part0 @@ -0,0 +1,4 @@ +{"accessright":"OPEN","country":"","description":"We describe the CoNLL-2002 shared task: language-independent named entity recognition. 
We give background information on the data sets and the evaluation method, present a general overview of the systems that have taken part in the task and discuss their performance.","id":"50|doi_dedup___::13b14c741a7b3420591c161f54ed5c80","keywords":"computer science - computation and language, i.2.7, computation and language (cs.cl), fos: computer and information sciences","language":"eng","publication_date":"2002-09-05","publisher":"","title":"Introduction to the CoNLL-2002 Shared Task: Language-Independent Named Entity Recognition","type":"publication"} +{"accessright":"OPEN","country":"GB","description":"Following a strategy similar to that used in baker's yeast (Herrgård et al. Nat Biotechnol 26:1155-1160, 2008). A consensus yeast metabolic network obtained from a community approach to systems biology (Herrgård et al. 2008; Dobson et al. BMC Syst Biol 4:145, 2010). Further developments towards a genome-scale metabolic model of yeast (Dobson et al. 2010; Heavner et al. BMC Syst Biol 6:55, 2012). Yeast 5-an expanded reconstruction of the Saccharomyces cerevisiae metabolic network (Heavner et al. 2012) and in Salmonella typhimurium (Thiele et al. BMC Syst Biol 5:8, 2011). A community effort towards a knowledge-base and mathematical model of the human pathogen Salmonellatyphimurium LT2 (Thiele et al. 2011), a recent paper (Thiele et al. Nat Biotechnol 31:419-425, 2013). A community-driven global reconstruction of human metabolism (Thiele et al. 2013) described a much improved 'community consensus' reconstruction of the human metabolic network, called Recon 2, and the authors (that include the present ones) have made it freely available via a database at http://humanmetabolism.org/ and in SBML format at Biomodels (http://identifiers.org/biomodels.db/MODEL1109130000. This short analysis summarises the main findings, and suggests some approaches that will be able to exploit the availability of this model to advantage. © 2013 The Author(s).","id":"50|doi_dedup___::e0392f427fea9a701aa469e6f24bdf93","keywords":"review article, metabolism, modelling, systems biology, networks, metabolic networks, clinical biochemistry, biochemistry, endocrinology, diabetes and metabolism, community approach, operations research, metabolic network, human metabolism, metabolic model, biology, computational biology, sbml, 03 medical and health sciences, 0302 clinical medicine, 0303 health sciences, 030220 oncology & carcinogenesis, 030304 developmental biology, researchinstitutes_networks_beacons/manchester_institute_of_biotechnology, manchester institute of biotechnology","language":"eng","publication_date":"2013-08-01","publisher":"Springer US","title":"An analysis of a ‘community-driven’ reconstruction of the human metabolic network","type":"publication"} +{"accessright":"OPEN","country":"","description":"Current machine learning systems operate, almost exclusively, in a statistical, or model-free mode, which entails severe theoretical limits on their power and performance. Such systems cannot reason about interventions and retrospection and, therefore, cannot serve as the basis for strong AI. To achieve human level intelligence, learning machines need the guidance of a model of reality, similar to the ones used in causal inference tasks. 
To demonstrate the essential role of such models, I will present a summary of seven tasks which are beyond reach of current machine learning systems and which have been accomplished using the tools of causal modeling.","id":"50|doi_dedup___::2436e90941a664931b54b956ade5b77b","keywords":"machine learning (cs.lg), artificial intelligence (cs.ai), machine learning (stat.ml), fos: computer and information sciences, mode (statistics), causal inference, artificial intelligence, business.industry, business, power (physics), computer science, machine learning, computer.software_genre, computer, basis (linear algebra), 03 medical and health sciences, 02 engineering and technology, 0202 electrical engineering, electronic engineering, information engineering, 0301 basic medicine, 020201 artificial intelligence & image processing, 030104 developmental biology, computer science - learning, computer science - artificial intelligence, statistics - machine learning","language":"und","publication_date":"2018-02-02","publisher":"arXiv","title":"Theoretical Impediments to Machine Learning With Seven Sparks from the Causal Revolution","type":"publication"} +{"accessright":"OPEN","country":"","description":"In most natural and engineered systems, a set of entities interact with each other in complicated patterns that can encompass multiple types of relationships, change in time, and include other types of complications. Such systems include multiple subsystems and layers of connectivity, and it is important to take such \"multilayer\" features into account to try to improve our understanding of complex systems. Consequently, it is necessary to generalize \"traditional\" network theory by developing (and validating) a framework and associated tools to study multilayer systems in a comprehensive fashion. The origins of such efforts date back several decades and arose in multiple disciplines, and now the study of multilayer networks has become one of the most important directions in network science. In this paper, we discuss the history of multilayer networks (and related concepts) and review the exploding body of work on such networks. To unify the disparate terminology in the large body of recent work, we discuss a general framework for multilayer networks, construct a dictionary of terminology to relate the numerous existing concepts to each other, and provide a thorough discussion that compares, contrasts, and translates between related notions such as multilayer networks, multiplex networks, interdependent networks, networks of networks, and many others. We also survey and discuss existing data sets that can be represented as multilayer networks. We review attempts to generalize single-layer-network diagnostics to multilayer networks. We also discuss the rapidly expanding research on multilayer-network models and notions like community structure, connected components, tensor decompositions, and various types of dynamical processes on multilayer networks. 
We conclude with a summary and an outlook.","id":"50|doi_dedup___::c5a574592f2e347f27be49d2c20a5558","keywords":"applied mathematics, computational mathematics, control and optimization, management science and operations research, computer networks and communications, data science, connected component, terminology, complex system, network theory, network science, construct (philosophy), computer science, interdependent networks, set (psychology), 01 natural sciences, 0103 physical sciences, 010306 general physics, 010305 fluids & plasmas, physics - physics and society, computer science - social and information networks, physics and society (physics.soc-ph), social and information networks (cs.si), fos: physical sciences, fos: computer and information sciences","language":"und","publication_date":"2013-09-27","publisher":"Oxford University Press (OUP)","title":"Multilayer networks","type":"publication"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result_author/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result_author/part0 new file mode 100644 index 0000000..b386db3 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result_author/part0 @@ -0,0 +1,17 @@ +{"author_id":"6fa85e5d3da0c5ed3ab65e4423481714","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"} +{"author_id":"dad3b6e22750b26a27296cd1c98565d1","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"} +{"author_id":"121d8003d3895905cfd67b9b69ac99e1","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"} +{"author_id":"91d3d8c07152d64fbf1c059940211334","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"} +{"author_id":"a25d1cc688c34c0458a4b00b48bc4cdc","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"} +{"author_id":"968ad30220675afb7a0b2b583b35c3a1","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"} +{"author_id":"a55af296962dfb58977aabcb3cf6a8d9","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"} +{"author_id":"5a344a09dab274779fd8e34654fd3541","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"} +{"author_id":"77104c891595df750391d710280da022","result_id":"50|doi_dedup___::b2ffae13a6f06b87539d538dc4919df7"} +{"author_id":"148f572c63c1f22386c1cae02e5bae2d","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"} +{"author_id":"8e571c27bc66cf96051302db9aa903dc","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"} +{"author_id":"175e45bf98e2b74df9c888598bb917fc","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"} +{"author_id":"bcdeabeece29231977e580b8f417ea82","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"} +{"author_id":"11cea0826b37ff58aa2f4c12ec42695e","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"} +{"author_id":"faf54def0161659b903f58ab4ce8bfae","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"} +{"author_id":"088daddc0f62bc2b8700a4e66a399d5f","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"} +{"author_id":"0b78df096d451535b5b8f7f4f47a6433","result_id":"50|doi_dedup___::2cd92ff12dd2fa919308d9438d9058b6"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result_pid/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result_pid/part0 new file mode 100644 index 0000000..d969f29 --- 
/dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result_pid/part0 @@ -0,0 +1,12 @@ +{"id":"94c1431ed983f9ea9996650e2d2205cc","pid":"10.5281/zenodo.3529160","result_id":"50|doi_dedup___::564289a1b69707f216d73aafdd70b20e","type":"doi"} +{"id":"f2328b2e830ee5c03945f65ab1802af7","pid":"10.3389/fphar.2019.01303","result_id":"50|doi_dedup___::564289a1b69707f216d73aafdd70b20e","type":"doi"} +{"id":"53511fa534223420fb925c58051725d6","pid":"31749705","result_id":"50|doi_dedup___::564289a1b69707f216d73aafdd70b20e","type":"pmid"} +{"id":"0e254059fe10cf07df8dbae2cfe5797e","pid":"pmc6848277","result_id":"50|doi_dedup___::564289a1b69707f216d73aafdd70b20e","type":"pmc"} +{"id":"a6181896a32edebf1c45649b894e5886","pid":"10.5281/zenodo.3529159","result_id":"50|doi_dedup___::564289a1b69707f216d73aafdd70b20e","type":"doi"} +{"id":"6e2dc8a4fd3523656a5abd3c0e090a18","pid":"10.7287/peerj.preprints.2711v2","result_id":"50|doi_dedup___::612838ab331dcdfeb9862351bd3fb423","type":"doi"} +{"id":"2072bbca91cb3f3a05b2454edce57f6f","pid":"10.1371/journal.pbio.1002614","result_id":"50|doi_dedup___::612838ab331dcdfeb9862351bd3fb423","type":"doi"} +{"id":"a4e63567711400f9526cc46ca84d2bc1","pid":"pmc5655613","result_id":"50|doi_dedup___::612838ab331dcdfeb9862351bd3fb423","type":"pmc"} +{"id":"477cabc52ec11dfaec8631ee1073376d","pid":"29065148","result_id":"50|doi_dedup___::612838ab331dcdfeb9862351bd3fb423","type":"pmid"} +{"id":"27285b8c2487b534fc2196d27ad4cf0d","pid":"10.7287/peerj.preprints.2711v3","result_id":"50|doi_dedup___::612838ab331dcdfeb9862351bd3fb423","type":"doi"} +{"id":"056a211b8f85fe3058825df170960c06","pid":"10.1111/cgf.13610","result_id":"50|doi_dedup___::32c3649d7aa266f3d754463d6194ebd5","type":"doi"} +{"id":"79c575556941fbb62d9eee77b97fd0e4","pid":"1902.06815","result_id":"50|doi_dedup___::32c3649d7aa266f3d754463d6194ebd5","type":"arxiv"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/author/part1 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/author/part1 new file mode 100644 index 0000000..6691cd2 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/author/part1 @@ -0,0 +1,2 @@ +{"firstname":"Maurizio","fullname":"Toscano, Maurizio","id":"045bdce3ee24842af4eb4a7f89a44adb","lastname":"Toscano","orcid":""} +{"firstname":"","fullname":"Aitor Díaz","id":"25fc898122164b69f56f08a8545804d3","lastname":"","orcid":""} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result/part0 new file mode 100644 index 0000000..c2501c3 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result/part0 @@ -0,0 +1 @@ +{"accessright":"OPEN","country":"","description":"

Mapping digital humanities in Spain (1993-2019)

This dataset has been extensively analysed in the following paper https://doi.org/10.3145/epi.2020.nov.01 and has also been used for the following poster https://doi.org/10.5281/zenodo.4256689

","id":"50|doi_dedup___::57c23b72fc2da4d47b35e5b871c35423","keywords":"","language":"esl/spa","publication_date":"2020-06-14","publisher":"Zenodo","title":"Mapping digital humanities in Spain - 1993-2019","type":"software"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result_author/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result_author/part0 new file mode 100644 index 0000000..fe7f499 --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result_author/part0 @@ -0,0 +1,2 @@ +{"author_id":"045bdce3ee24842af4eb4a7f89a44adb","result_id":"50|doi_dedup___::57c23b72fc2da4d47b35e5b871c35423"} +{"author_id":"25fc898122164b69f56f08a8545804d3","result_id":"50|doi_dedup___::57c23b72fc2da4d47b35e5b871c35423"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result_pid/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result_pid/part0 new file mode 100644 index 0000000..b72038a --- /dev/null +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/software/result_pid/part0 @@ -0,0 +1,2 @@ +{"id":"cb7d0c2e4660c784cb647060974dbee7","pid":"10.5281/zenodo.3893545","result_id":"50|doi_dedup___::57c23b72fc2da4d47b35e5b871c35423","type":"doi"} +{"id":"19703b43918fc184698f6e0298bf2fc8","pid":"10.5281/zenodo.3893546","result_id":"50|doi_dedup___::57c23b72fc2da4d47b35e5b871c35423","type":"doi"} \ No newline at end of file From e87b790a609835a0666b527f42e5e0af8331b8fc Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 22 Jun 2023 16:54:13 +0200 Subject: [PATCH 13/19] - --- .../dhp/oa/graph/dump/SendToZenodoHDFS.java | 6 +- .../oa/graph/dump/csv/SparkDumpResults.java | 28 +- .../SparkSelectResultsAndDumpRelations.java | 241 ++++++++++++++++++ .../oa/graph/dump/csv/oozie_app/workflow.xml | 2 +- .../serafeim/oozie_app/config-default.xml | 30 +++ .../dump/serafeim/oozie_app/workflow.xml | 102 ++++++++ .../dhp/oa/graph/dump/csv/DumpResultTest.java | 109 ++++---- .../graph/dump/csv/MoveOnSingleDirTest.java | 6 +- .../dhp/oa/graph/dump/csv/input/publication | 3 +- .../dhp/oa/graph/dump/csv/input/relation | 3 +- .../dump/csv/working/publication/result/part0 | 3 +- .../dump/csv/working/resultIds/part-00000 | 1 + pom.xml | 2 +- 13 files changed, 466 insertions(+), 70 deletions(-) create mode 100644 dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/serafeim/SparkSelectResultsAndDumpRelations.java create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/serafeim/oozie_app/config-default.xml create mode 100644 dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/serafeim/oozie_app/workflow.xml diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/SendToZenodoHDFS.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/SendToZenodoHDFS.java index 685af91..71c10be 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/SendToZenodoHDFS.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/SendToZenodoHDFS.java @@ -82,9 +82,9 @@ public class SendToZenodoHDFS implements Serializable { if (!pString.endsWith("_SUCCESS")) { String name = pString.substring(pString.lastIndexOf("/") + 1); - FSDataInputStream inputStream = fileSystem.open(p); - zenodoApiClient.uploadIS(inputStream, name); - + try (FSDataInputStream inputStream = fileSystem.open(p)) { + zenodoApiClient.uploadIS(inputStream, name); + } } } diff --git 
a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java index 9fd10f1..f94ad8f 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.oa.graph.dump.csv; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static org.apache.commons.lang3.StringUtils.remove; import static org.apache.commons.lang3.StringUtils.split; import java.io.Serializable; @@ -87,6 +88,7 @@ public class SparkDumpResults implements Serializable { Class inputClazz, String resultType, String workingPath) { Dataset resultIds = spark.read().textFile(workingPath + "/resultIds"); + // resultIds.foreach((ForeachFunction) r -> System.out.println(r)); Dataset results = Utils .readPath(spark, inputPath + "/" + resultType, inputClazz) .filter( @@ -108,8 +110,6 @@ public class SparkDumpResults implements Serializable { Encoders.bean(CSVResult.class)) .write() .option("compression", "gzip") -// .option("header", "true") -// .option("delimiter", Constants.SEP) .mode(SaveMode.Overwrite) .json(workingPath + "/" + resultType + "/result"); @@ -125,8 +125,6 @@ public class SparkDumpResults implements Serializable { .filter(Objects::nonNull) .write() .option("compression", "gzip") -// .option("header", "true") -// .option("delimiter", Constants.SEP) .mode(SaveMode.Overwrite) .json(workingPath + "/" + resultType + "/result_pid"); @@ -186,8 +184,6 @@ public class SparkDumpResults implements Serializable { Encoders.bean(CSVRelResAut.class)) .write() .option("compression", "gzip") -// .option("header", "true") -// .option("delimiter", Constants.SEP) .mode(SaveMode.Overwrite) .json(workingPath + "/" + resultType + "/result_author"); @@ -199,8 +195,6 @@ public class SparkDumpResults implements Serializable { Encoders.bean(CSVAuthor.class)) .write() .option("compression", "gzip") -// .option("header", "true") -// .option("delimiter", Constants.SEP) .mode(SaveMode.Overwrite) .json(workingPath + "/" + resultType + "/author"); @@ -264,7 +258,7 @@ public class SparkDumpResults implements Serializable { private static String getFieldValue(Field input) { if (input != null && StringUtils.isNotEmpty(input.getValue())) { - return input.getValue(); + return removeBreaks(input.getValue()); } else { return ""; } @@ -283,7 +277,7 @@ public class SparkDumpResults implements Serializable { if (Optional.ofNullable(r.getSubject()).isPresent()) ret.setKeywords(String.join(", ", r.getSubject().stream().map(s -> { if (StringUtils.isNotEmpty(s.getValue())) - return s.getValue().toLowerCase(); + return removeBreaks(s.getValue().toLowerCase()); else return null; }).filter(Objects::nonNull).distinct().collect(Collectors.toList()))); @@ -311,7 +305,7 @@ public class SparkDumpResults implements Serializable { return ""; for (Field abs : description) { if (StringUtils.isNotEmpty(abs.getValue())) { - return abs.getValue(); + return removeBreaks(abs.getValue()); } } return ""; @@ -322,14 +316,22 @@ public class SparkDumpResults implements Serializable { for (StructuredProperty title : titles) { if (StringUtils.isEmpty(firstTitle)) { if (StringUtils.isNotEmpty(title.getValue())) - firstTitle = title.getValue(); + firstTitle = removeBreaks(title.getValue()); } if (title.getQualifier().getClassid().equals(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid())) { if (StringUtils.isNotEmpty(title.getValue())) - 
return title.getValue(); + return removeBreaks(title.getValue()); } } + if (firstTitle != null) { + return removeBreaks(firstTitle); + } return ""; } + private static String removeBreaks(String input) { + return input.replace("\n", " ").replace("\t", " ").replace("\r", " "); + + } + } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/serafeim/SparkSelectResultsAndDumpRelations.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/serafeim/SparkSelectResultsAndDumpRelations.java new file mode 100644 index 0000000..1f31c3c --- /dev/null +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/serafeim/SparkSelectResultsAndDumpRelations.java @@ -0,0 +1,241 @@ + +package eu.dnetlib.dhp.oa.graph.dump.serafeim; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.dhp.oa.graph.dump.csv.Constants; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import scala.Tuple2; + +/** + * @author miriam.baglioni + * @Date 04/05/23 + */ +//STEP 2 +public class SparkSelectResultsAndDumpRelations implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(SparkSelectResultsAndDumpRelations.class); + private static String RESULT_COMMUNITY_TABLE = "/result_community"; + private static String COMMUNITY_RESULT_IDS = "/communityResultIds"; + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkSelectResultsAndDumpRelations.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dump/input_dump_csv_ste2.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String workingPath = parser.get("workingPath"); + + List communityList = null; + Optional communities = Optional.ofNullable(parser.get("communities")); + if (communities.isPresent()) { + communityList = Arrays.asList(communities.get().split(";")); + } + + SparkConf conf = new SparkConf(); + + List finalCommunityList = communityList; + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Utils.removeOutputDir(spark, outputPath); + run(spark, inputPath, outputPath, workingPath, finalCommunityList); + + }); + + } + + private static void run(SparkSession spark, String inputPath, String outputPath, + String workingPath, + List communityList) { + + // select the result ids related to the set of communities considered + writeCommunityRelatedIds( + spark, inputPath, 
Publication.class, communityList, workingPath, "publication");
+		writeCommunityRelatedIds(
+			spark, inputPath, Dataset.class, communityList, workingPath, "dataset");
+		writeCommunityRelatedIds(
+			spark, inputPath, Software.class, communityList, workingPath, "software");
+		writeCommunityRelatedIds(
+			spark, inputPath, OtherResearchProduct.class, communityList,
+			workingPath, "otherresearchproduct");
+
+		// select the relations with semantics cites
+		org.apache.spark.sql.Dataset<Relation> relations = Utils
+			.readPath(spark, inputPath + "/relation", Relation.class)
+			.filter(
+				(FilterFunction<Relation>) r -> !r.getDataInfo().getDeletedbyinference() &&
+					r.getRelClass().equals(ModelConstants.CITES));
+
+		// select the relations having as source one of the results related to the
+		// communities
+		org.apache.spark.sql.Dataset<String> communityResultIds = spark
+			.read()
+			.textFile(workingPath + COMMUNITY_RESULT_IDS)
+			.distinct();
+
+		Utils
+			.readPath(spark, inputPath + "/publication", Publication.class)
+			.filter(
+				(FilterFunction<Publication>) p -> !p.getDataInfo().getDeletedbyinference()
+					&& !p.getDataInfo().getInvisible())
+			.map((MapFunction<Publication, String>) p -> p.getId(), Encoders.STRING())
+			.union(
+				Utils
+					.readPath(spark, inputPath + "/dataset", Dataset.class)
+					.filter(
+						(FilterFunction<Dataset>) p -> !p.getDataInfo().getDeletedbyinference()
+							&& !p.getDataInfo().getInvisible())
+					.map((MapFunction<Dataset, String>) p -> p.getId(), Encoders.STRING()))
+			.union(
+				Utils
+					.readPath(spark, inputPath + "/software", Software.class)
+					.filter(
+						(FilterFunction<Software>) p -> !p.getDataInfo().getDeletedbyinference()
+							&& !p.getDataInfo().getInvisible())
+					.map((MapFunction<Software, String>) p -> p.getId(), Encoders.STRING()))
+			.union(
+				Utils
+					.readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class)
+					.filter(
+						(FilterFunction<OtherResearchProduct>) p -> !p.getDataInfo().getDeletedbyinference()
+							&& !p.getDataInfo().getInvisible())
+					.map((MapFunction<OtherResearchProduct, String>) p -> p.getId(), Encoders.STRING()))
+			.write()
+			.mode(SaveMode.Overwrite)
+			.option("compression", "gzip")
+			.text(workingPath + "/resultIds");
+
+		org.apache.spark.sql.Dataset<String> resultIds = spark.read().textFile(workingPath + "/resultIds");
+
+		org.apache.spark.sql.Dataset<Relation> oksource = communityResultIds
+			.joinWith(relations, communityResultIds.col("value").equalTo(relations.col("source")))
+			.map(
+				(MapFunction<Tuple2<String, Relation>, Relation>) t2 -> t2._2(),
+				Encoders.bean(Relation.class));
+		oksource
+			.joinWith(resultIds, oksource.col("target").equalTo(resultIds.col("value")))
+			.map((MapFunction<Tuple2<Relation, String>, Relation>) t2 -> t2._1(), Encoders.bean(Relation.class))
+			.write()
+			.option("compression", "gzip")
+			.mode(SaveMode.Overwrite)
+			.json(outputPath + "/relation");
+
+		writeNodes(
+			spark, inputPath + "/publication", Publication.class, outputPath + "/publication",
+			outputPath + "/relation", workingPath);
+		writeNodes(
+			spark, inputPath + "/dataset", Dataset.class, outputPath + "/dataset", outputPath + "/relation",
+			workingPath);
+		writeNodes(
+			spark, inputPath + "/software", Software.class, outputPath + "/software", outputPath + "/relation",
+			workingPath);
+		writeNodes(
+			spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class,
+			outputPath + "/otherresearchproduct", outputPath + "/relation", workingPath);
+
+	}
+
+	private static <R extends Result> void writeNodes(SparkSession spark, String inputPath, Class<R> clazz,
+		String outputPath, String relationPath, String workingPath) {
+		org.apache.spark.sql.Dataset<Relation> citingRelations = Utils.readPath(spark, relationPath, Relation.class);
+		org.apache.spark.sql.Dataset<R> result = Utils
+			.readPath(spark, inputPath,
clazz) + .filter( + (FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && + !p.getDataInfo().getInvisible()); + + // take the distinct result id for source and target of the relations + citingRelations + .flatMap( + (FlatMapFunction) r -> Arrays + .asList(r.getSource(), r.getTarget()) + .iterator(), + Encoders.STRING()) + .distinct() + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .text(workingPath + "/relationIds"); + + org.apache.spark.sql.Dataset relationIds = spark.read().textFile(workingPath + "/relationIds"); + + relationIds + .joinWith(result, relationIds.col("value").equalTo(result.col("id"))) + .map((MapFunction, R>) t2 -> t2._2(), Encoders.bean(clazz)) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .json(outputPath); + } + + private static void writeCommunityRelatedIds(SparkSession spark, String inputPath, + Class clazz, List communityList, String outputPath, String resultType) { + org.apache.spark.sql.Dataset results = Utils + .readPath(spark, inputPath + "/" + resultType, clazz) + .filter( + (FilterFunction) p -> !p.getDataInfo().getDeletedbyinference() && + !p.getDataInfo().getInvisible() && + isRelatedToCommunities(p, communityList)); + results + .map((MapFunction) Result::getId, Encoders.STRING()) + .write() + .option("compression", "gzip") + .mode(SaveMode.Append) + .text(outputPath + COMMUNITY_RESULT_IDS); + +// results +// // .repartition(10000) +// .write() +// .option("compression", "gzip") +// .mode(SaveMode.Append) +// .json(outputPath + "/" + resultType); + + } + + private static boolean isRelatedToCommunities(R p, List communityList) { + return p + .getContext() + .stream() + .anyMatch( + c -> communityList.contains(c.getId()) || + (c.getId().contains("::") + && communityList.contains(c.getId().substring(0, c.getId().indexOf("::"))))); + } + +} diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml index 42fa50a..543de1c 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml @@ -65,7 +65,7 @@ - + diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/serafeim/oozie_app/config-default.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/serafeim/oozie_app/config-default.xml new file mode 100644 index 0000000..d262cb6 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/serafeim/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + hiveDbName + openaire + + + oozie.launcher.mapreduce.user.classpath.first + true + + diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/serafeim/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/serafeim/oozie_app/workflow.xml new file mode 100644 index 0000000..dc9ead6 --- /dev/null +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/serafeim/oozie_app/workflow.xml @@ -0,0 +1,102 @@ + + + + sourcePath + the source path + + + outputPath + the output path + + + communities + the communities whose products should be dumped + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for 
individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + yarn + cluster + select results ids connected to communities and dump relation + eu.dnetlib.dhp.oa.graph.dump.serafeim.SparkSelectResultsAndDumpRelations + dump-${projectVersion}.jar + + --executor-memory=10G + --executor-cores=3 + --driver-memory=10G + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --conf spark.sql.shuffle.partitions=3840 + + --sourcePath${sourcePath} + --workingPath${workingDir} + --outputPath${outputPath} + --communities${communities} + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + \ No newline at end of file diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java index 9ef6ea1..4f3c4f9 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java @@ -13,7 +13,10 @@ import java.util.Optional; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.ForeachFunction; import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.dom4j.Document; @@ -30,8 +33,12 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult; import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.utils.DHPUtils; +import scala.Function1; /** * @author miriam.baglioni @@ -96,7 +103,7 @@ public class DumpResultTest { SparkDumpResults.main(new String[] { "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-outputPath", workingDir.toString() + "/output", + "-workingPath", workingDir.toString() + "/working", "-resultType", "publication", "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", @@ -105,61 +112,69 @@ public class DumpResultTest { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - Dataset tmp = spark - .read() - .option("header", "true") - .option("delimiter", Constants.SEP) - .csv(workingDir.toString() + "/working/publication/result"); + Dataset tmp = Utils + .readPath(spark, 
workingDir.toString() + "/working/publication/result", CSVResult.class); - Assertions.assertEquals(3, tmp.count()); - Row row = tmp - .where("id = '50|DansKnawCris::0224aae28af558f21768dbc6439c7a95'") + tmp.show(false); + + Assertions.assertEquals(4, tmp.count()); + CSVResult row = tmp + .filter( + (FilterFunction) r -> r.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")) .first(); - Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAs("accessright")); - Assertions.assertEquals("FI", row.getAs("country")); - Assertions.assertEquals("Lit.opg., bijl.", row.getAs("description")); - Assertions.assertEquals(3, split(row.getAs("keywords"), ", ").length); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); - Assertions.assertTrue(row.getAs("keywords").toString().contains("prospectie")); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); - Assertions.assertEquals("nl", row.getAs("language")); - Assertions.assertEquals("2007-01-01", row.getAs("publication_date")); - Assertions.assertEquals("FakePublisher1", row.getAs("publisher")); + Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAccessright()); + Assertions.assertEquals("FI", row.getCountry()); + Assertions.assertEquals("Lit.opg., bijl.", row.getDescription()); + Assertions.assertEquals(3, split(row.getKeywords(), ", ").length); + Assertions.assertTrue(row.getKeywords().toString().contains("archeologie")); + Assertions.assertTrue(row.getKeywords().toString().contains("prospectie")); + Assertions.assertTrue(row.getKeywords().toString().contains("archaeology")); + Assertions.assertEquals("nl", row.getLanguage()); + Assertions.assertEquals("2007-01-01", row.getPublication_date()); + Assertions.assertEquals("FakePublisher1", row.getPublisher()); Assertions .assertEquals( "Inventariserend veldonderzoek d.m.v. 
boringen (karterende fase) : Raadhuisstraat te Dirkshorn, gemeente Harenkarspel", - row.getAs("title")); - Assertions.assertEquals("publication", row.getAs("type")); + row.getTitle()); + Assertions.assertEquals("publication", row.getType()); row = tmp - .where("id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'") + .filter( + (FilterFunction) r -> r.getId().equals("50|doi_________::715fec7723208e6f17e855c204656e2f")) .first(); - Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAs("accessright")); - Assertions.assertEquals(2, split(row.getAs("country"), ", ").length); - Assertions.assertNull(row.getAs("description")); - Assertions.assertEquals(2, split(row.getAs("keywords"), ", ").length); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); - Assertions.assertEquals("UNKNOWN", row.getAs("language")); - Assertions.assertNull(row.getAs("publication_date")); - Assertions.assertNull(row.getAs("publisher")); - Assertions.assertEquals("None", row.getAs("title")); - Assertions.assertEquals("publication", row.getAs("type")); - row = tmp - .where("id = '50|DansKnawCris::26780065282e607306372abd0d808245'") - .first(); - Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAs("accessright")); - Assertions.assertNull(row.getAs("country")); - Assertions.assertNull(row.getAs("description")); - Assertions.assertEquals(2, split(row.getAs("keywords"), ", ").length); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); - Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); - Assertions.assertEquals("UNKNOWN", row.getAs("language")); - Assertions.assertNull(row.getAs("publication_date")); - Assertions.assertNull(row.getAs("publisher")); - Assertions.assertEquals("None", row.getAs("title")); - Assertions.assertEquals("publication", row.getAs("type")); + System.out.println(row.getPublisher()); + String a = row.getPublisher().replace("\\n", " "); + System.out.println(a); +// row = tmp +// .where("id = '50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9'") +// .first(); +// Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAs("accessright")); +// Assertions.assertEquals(2, split(row.getAs("country"), ", ").length); +// Assertions.assertNull(row.getAs("description")); +// Assertions.assertEquals(2, split(row.getAs("keywords"), ", ").length); +// Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); +// Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); +// Assertions.assertEquals("UNKNOWN", row.getAs("language")); +// Assertions.assertNull(row.getAs("publication_date")); +// Assertions.assertNull(row.getAs("publisher")); +// Assertions.assertEquals("None", row.getAs("title")); +// Assertions.assertEquals("publication", row.getAs("type")); +// +// row = tmp +// .where("id = '50|DansKnawCris::26780065282e607306372abd0d808245'") +// .first(); +// Assertions.assertEquals(ModelConstants.OPEN_ACCESS_RIGHT().getClassid(), row.getAs("accessright")); +// Assertions.assertNull(row.getAs("country")); +// Assertions.assertNull(row.getAs("description")); +// Assertions.assertEquals(2, split(row.getAs("keywords"), ", ").length); +// Assertions.assertTrue(row.getAs("keywords").toString().contains("archeologie")); +// Assertions.assertTrue(row.getAs("keywords").toString().contains("archaeology")); +// 
Assertions.assertEquals("UNKNOWN", row.getAs("language")); +// Assertions.assertNull(row.getAs("publication_date")); +// Assertions.assertNull(row.getAs("publisher")); +// Assertions.assertEquals("None", row.getAs("title")); +// Assertions.assertEquals("publication", row.getAs("type")); } diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/MoveOnSingleDirTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/MoveOnSingleDirTest.java index dd8efca..279ba40 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/MoveOnSingleDirTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/MoveOnSingleDirTest.java @@ -98,12 +98,14 @@ public class MoveOnSingleDirTest { .option("delimiter", Constants.SEP) .csv(workingDir.toString() + "/output/result"); - Assertions.assertEquals(21, tmp.count()); + Assertions.assertEquals(22, tmp.count()); Assertions.assertEquals(12, tmp.filter("type == 'dataset'").count()); Assertions.assertEquals(4, tmp.filter("type == 'other'").count()); - Assertions.assertEquals(4, tmp.filter("type == 'publication'").count()); + Assertions.assertEquals(5, tmp.filter("type == 'publication'").count()); Assertions.assertEquals(1, tmp.filter("type == 'software'").count()); + tmp.filter("type == 'publication'").show(false); + Assertions .assertEquals( 8, spark diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication index 2b003e7..9ccfc32 100644 --- a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication @@ -1,4 +1,5 @@ {"author":[{"affiliation":[],"fullname":"Alrasheed, Maryam","name":"Maryam","pid":[],"rank":1,"surname":"Alrasheed"},{"affiliation":[],"fullname":"Blondin, Michael","name":"Michael","pid":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"orcid","classname":"Open Researcher and Contributor ID","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"0000-0003-2914-2734"}],"rank":1,"surname":"Blondin"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}, {"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"ni"},{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - 
Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"dh-ch"}],"contributor":[],"country":[{"classid":"FI","classname":"Finland","dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"country:instrepos","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"schemeid":"dnet:countries","schemename":"dnet:countries"}],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"dateofcollection":"","dateoftransformation":"2020-05-25T16:14:18.452Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Lit.opg., bijl."}],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::0224aae28af558f21768dbc6439c7a95","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"nl","classname":"nl","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282676557,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T07:51:24Z","harvestDate":"2020-05-25T11:33:13.427Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550013110","metadataNamespace":""}},"originalId":["DansKnawCris::0224aae28af558f21768dbc6439c7a95"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550013110"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1023/fakedoi"}],"publisher":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"FakePublisher1"},"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"prospectie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","s
chemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Inventariserend veldonderzoek d.m.v. boringen (karterende fase) : Raadhuisstraat te Dirkshorn, gemeente Harenkarspel"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Synthegra Archeologie Rapportenreeks P0502381"}],"journal":null} {"author":[{"affiliation":[],"fullname":"Blondin, Michael","name":"Michael","pid":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"orcid","classname":"Open Researcher and Contributor ID","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"0000-0003-2914-2734"}],"rank":1,"surname":"Blondin"},{"affiliation":[],"fullname":"Raskin, Mikhail","name":"Mikhail","pid":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"orcid_pending","classname":"Open Researcher and Contributor ID","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"0000-0002-6660-5673"}],"rank":2,"surname":"Raskin"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}],"contributor":[],"country":[{"classid":"IT","classname":"Finland","dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"country:instrepos","classname":"Inferred by 
OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"schemeid":"dnet:countries","schemename":"dnet:countries"},{"classid":"FI","classname":"Finland","dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"country:instrepos","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"schemeid":"dnet:countries","schemename":"dnet:countries"}],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:03:57.761Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591283087415,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T07:58:39Z","harvestDate":"2020-05-25T11:34:38.707Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce-kb:document:800020324","metadataNamespace":""}},"originalId":["DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceFullCatalogue&search=priref=800020324"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"",
"classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} {"author":[{"affiliation":[],"fullname":"Ward, Mark Daniel","name":"Mark Daniel","pid":[],"rank":1,"surname":"Ward"},{"affiliation":[],"fullname":"Szpankowski, Wojciech","name":"Wojciech","pid":[],"rank":2,"surname":"Szpankowski"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"ni"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:13:23.976Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::26780065282e607306372abd0d808245","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282897527,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T08:42:33Z","harvestDate":"2020-05-25T11:40:10.845Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550053196","metadataNamespace":""}},"originalId":["DansKnawCris::26780065282e607306372abd0d808245"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} -{"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open 
Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:13:23.976Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::26780065282e607306372abd0d80fake","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282897527,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T08:42:33Z","harvestDate":"2020-05-25T11:40:10.845Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550053196","metadataNamespace":""}},"originalId":["DansKnawCris::26780065282e607306372abd0d808245"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017
","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} \ No newline at end of file +{"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:13:23.976Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::26780065282e607306372abd0d80fake","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282897527,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T08:42:33Z","harvestDate":"2020-05-25T11:40:10.845Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550053196","metadataNamespace":""}},"originalId":["DansKnawCris::26780065282e607306372abd0d808245"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":
"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} +{"dataInfo": {"invisible": false, "trust": "0.9", "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "deletedbyinference": false}, "resourcetype": {"classid": "0013", "classname": "Part of book or chapter of book", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}, "pid": [{"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.1090/dimacs/044/20"}], "contributor": [], "bestaccessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "relevantdate": [{"qualifier": {"classid": "created", "classname": "created", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2017-04-27"}, {"qualifier": {"classid": "published-print", "classname": "published-print", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "1998-10-19"}], "collectedfrom": [{"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}], "id":"50|doi_________::715fec7723208e6f17e855c204656e2f", "subject": [], "dateofacceptance": {"value": "1998-10-19"}, "lastupdatetimestamp": 1675978002598, "author": [{"surname": "Deaton", "fullname": "R. Deaton", "pid": [], "name": "R.", "rank": 1}, {"surname": "Murphy", "fullname": "R. Murphy", "pid": [], "name": "R.", "rank": 2}, {"surname": "Garzon", "fullname": "M. Garzon", "pid": [], "name": "M.", "rank": 3}, {"surname": "Franceschetti", "fullname": "D. Franceschetti", "pid": [], "name": "D.", "rank": 4}, {"surname": "Stevens", "fullname": "S. 
Stevens", "pid": [], "name": "S.", "rank": 5}], "instance": [{"refereed": {"classid": "0000", "classname": "UNKNOWN", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "collectedfrom": {"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}, "hostedby": {"dataInfo": {"invisible": false, "deletedbyinference": false}, "value": "Unknown Repository", "key": "10|openaire____::55045bd2a65019fd8e6741a755395c8c"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "dateofacceptance": {"value": "1998-10-19"}, "url": ["https://doi.org/10.1090/dimacs/044/20"], "measures": [{"id": "influence", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "1.9184702E-8", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "5.79069E-9", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "influence_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "51", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "0.8491071", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "impulse", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", 
"schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "2", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}], "pid": [{"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.1090/dimacs/044/20"}], "instancetype": {"classid": "0013", "classname": "Part of book or chapter of book", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "dateofcollection": "2023-02-09T21:26:42Z", "fulltext": [], "description": [], "format": [], "measures": [{"id": "influence", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "1.7008906E-8", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "5.1452145E-9", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "influence_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "51", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "0.50946426", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, 
"trust": ""}, "value": "C", "key": "class"}]}, {"id": "impulse", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "2", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}], "coverage": [], "externalReference": [], "publisher": {"value": "American Mathematical\\n Society"}, "eoscifguidelines": [], "language": {"classid": "und", "classname": "Undetermined", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "country": [], "extraInfo": [], "originalId": ["10.1090/dimacs/044/20", "50|doiboost____::715fec7723208e6f17e855c204656e2f"], "source": [{"value": "Crossref"}], "context": [], "title": [{"qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Good encodings for DNA-based solutions to combinatorial problems"}]} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/relation b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/relation index 1a9a370..9987812 100644 --- a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/relation +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/relation @@ -1,4 +1,5 @@ {"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d"} {"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"Cites","relType":"datasourceOrganization","source":"50|DansKnawCris::26780065282e607306372abd0d808245","subRelType":"provision","target":"50|DansKnawCris::26780065282e607306372abd0d808246"} 
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"Cites","relType":"datasourceOrganization","source":"50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9","subRelType":"provision","target":"50|DansKnawCris::26780065282e607306372abd0d808245"} -{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"Cites","relType":"datasourceOrganization","source":"50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9","subRelType":"provision","target":"50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"} \ No newline at end of file +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"Cites","relType":"datasourceOrganization","source":"50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9","subRelType":"provision","target":"50|DansKnawCris::0224aae28af558f21768dbc6439c7a95"} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"Cites","relType":"datasourceOrganization","source":"50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9","subRelType":"provision","target":"50|doi_________::715fec7723208e6f17e855c204656e2f"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result/part0 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result/part0 index c264de7..704eacd 100644 --- a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result/part0 +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/publication/result/part0 @@ -1,4 +1,5 @@ {"accessright":"OPEN","country":"","description":"We describe the CoNLL-2002 shared task: language-independent named entity recognition. 
We give background information on the data sets and the evaluation method, present a general overview of the systems that have taken part in the task and discuss their performance.","id":"50|doi_dedup___::13b14c741a7b3420591c161f54ed5c80","keywords":"computer science - computation and language, i.2.7, computation and language (cs.cl), fos: computer and information sciences","language":"eng","publication_date":"2002-09-05","publisher":"","title":"Introduction to the CoNLL-2002 Shared Task: Language-Independent Named Entity Recognition","type":"publication"} {"accessright":"OPEN","country":"GB","description":"Following a strategy similar to that used in baker's yeast (Herrgård et al. Nat Biotechnol 26:1155-1160, 2008). A consensus yeast metabolic network obtained from a community approach to systems biology (Herrgård et al. 2008; Dobson et al. BMC Syst Biol 4:145, 2010). Further developments towards a genome-scale metabolic model of yeast (Dobson et al. 2010; Heavner et al. BMC Syst Biol 6:55, 2012). Yeast 5-an expanded reconstruction of the Saccharomyces cerevisiae metabolic network (Heavner et al. 2012) and in Salmonella typhimurium (Thiele et al. BMC Syst Biol 5:8, 2011). A community effort towards a knowledge-base and mathematical model of the human pathogen Salmonellatyphimurium LT2 (Thiele et al. 2011), a recent paper (Thiele et al. Nat Biotechnol 31:419-425, 2013). A community-driven global reconstruction of human metabolism (Thiele et al. 2013) described a much improved 'community consensus' reconstruction of the human metabolic network, called Recon 2, and the authors (that include the present ones) have made it freely available via a database at http://humanmetabolism.org/ and in SBML format at Biomodels (http://identifiers.org/biomodels.db/MODEL1109130000. This short analysis summarises the main findings, and suggests some approaches that will be able to exploit the availability of this model to advantage. © 2013 The Author(s).","id":"50|doi_dedup___::e0392f427fea9a701aa469e6f24bdf93","keywords":"review article, metabolism, modelling, systems biology, networks, metabolic networks, clinical biochemistry, biochemistry, endocrinology, diabetes and metabolism, community approach, operations research, metabolic network, human metabolism, metabolic model, biology, computational biology, sbml, 03 medical and health sciences, 0302 clinical medicine, 0303 health sciences, 030220 oncology & carcinogenesis, 030304 developmental biology, researchinstitutes_networks_beacons/manchester_institute_of_biotechnology, manchester institute of biotechnology","language":"eng","publication_date":"2013-08-01","publisher":"Springer US","title":"An analysis of a ‘community-driven’ reconstruction of the human metabolic network","type":"publication"} {"accessright":"OPEN","country":"","description":"Current machine learning systems operate, almost exclusively, in a statistical, or model-free mode, which entails severe theoretical limits on their power and performance. Such systems cannot reason about interventions and retrospection and, therefore, cannot serve as the basis for strong AI. To achieve human level intelligence, learning machines need the guidance of a model of reality, similar to the ones used in causal inference tasks. 
To demonstrate the essential role of such models, I will present a summary of seven tasks which are beyond reach of current machine learning systems and which have been accomplished using the tools of causal modeling.","id":"50|doi_dedup___::2436e90941a664931b54b956ade5b77b","keywords":"machine learning (cs.lg), artificial intelligence (cs.ai), machine learning (stat.ml), fos: computer and information sciences, mode (statistics), causal inference, artificial intelligence, business.industry, business, power (physics), computer science, machine learning, computer.software_genre, computer, basis (linear algebra), 03 medical and health sciences, 02 engineering and technology, 0202 electrical engineering, electronic engineering, information engineering, 0301 basic medicine, 020201 artificial intelligence & image processing, 030104 developmental biology, computer science - learning, computer science - artificial intelligence, statistics - machine learning","language":"und","publication_date":"2018-02-02","publisher":"arXiv","title":"Theoretical Impediments to Machine Learning With Seven Sparks from the Causal Revolution","type":"publication"} -{"accessright":"OPEN","country":"","description":"In most natural and engineered systems, a set of entities interact with each other in complicated patterns that can encompass multiple types of relationships, change in time, and include other types of complications. Such systems include multiple subsystems and layers of connectivity, and it is important to take such \"multilayer\" features into account to try to improve our understanding of complex systems. Consequently, it is necessary to generalize \"traditional\" network theory by developing (and validating) a framework and associated tools to study multilayer systems in a comprehensive fashion. The origins of such efforts date back several decades and arose in multiple disciplines, and now the study of multilayer networks has become one of the most important directions in network science. In this paper, we discuss the history of multilayer networks (and related concepts) and review the exploding body of work on such networks. To unify the disparate terminology in the large body of recent work, we discuss a general framework for multilayer networks, construct a dictionary of terminology to relate the numerous existing concepts to each other, and provide a thorough discussion that compares, contrasts, and translates between related notions such as multilayer networks, multiplex networks, interdependent networks, networks of networks, and many others. We also survey and discuss existing data sets that can be represented as multilayer networks. We review attempts to generalize single-layer-network diagnostics to multilayer networks. We also discuss the rapidly expanding research on multilayer-network models and notions like community structure, connected components, tensor decompositions, and various types of dynamical processes on multilayer networks. 
We conclude with a summary and an outlook.","id":"50|doi_dedup___::c5a574592f2e347f27be49d2c20a5558","keywords":"applied mathematics, computational mathematics, control and optimization, management science and operations research, computer networks and communications, data science, connected component, terminology, complex system, network theory, network science, construct (philosophy), computer science, interdependent networks, set (psychology), 01 natural sciences, 0103 physical sciences, 010306 general physics, 010305 fluids & plasmas, physics - physics and society, computer science - social and information networks, physics and society (physics.soc-ph), social and information networks (cs.si), fos: physical sciences, fos: computer and information sciences","language":"und","publication_date":"2013-09-27","publisher":"Oxford University Press (OUP)","title":"Multilayer networks","type":"publication"} \ No newline at end of file +{"accessright":"OPEN","country":"","description":"In most natural and engineered systems, a set of entities interact with each other in complicated patterns that can encompass multiple types of relationships, change in time, and include other types of complications. Such systems include multiple subsystems and layers of connectivity, and it is important to take such \"multilayer\" features into account to try to improve our understanding of complex systems. Consequently, it is necessary to generalize \"traditional\" network theory by developing (and validating) a framework and associated tools to study multilayer systems in a comprehensive fashion. The origins of such efforts date back several decades and arose in multiple disciplines, and now the study of multilayer networks has become one of the most important directions in network science. In this paper, we discuss the history of multilayer networks (and related concepts) and review the exploding body of work on such networks. To unify the disparate terminology in the large body of recent work, we discuss a general framework for multilayer networks, construct a dictionary of terminology to relate the numerous existing concepts to each other, and provide a thorough discussion that compares, contrasts, and translates between related notions such as multilayer networks, multiplex networks, interdependent networks, networks of networks, and many others. We also survey and discuss existing data sets that can be represented as multilayer networks. We review attempts to generalize single-layer-network diagnostics to multilayer networks. We also discuss the rapidly expanding research on multilayer-network models and notions like community structure, connected components, tensor decompositions, and various types of dynamical processes on multilayer networks. 
We conclude with a summary and an outlook.","id":"50|doi_dedup___::c5a574592f2e347f27be49d2c20a5558","keywords":"applied mathematics, computational mathematics, control and optimization, management science and operations research, computer networks and communications, data science, connected component, terminology, complex system, network theory, network science, construct (philosophy), computer science, interdependent networks, set (psychology), 01 natural sciences, 0103 physical sciences, 010306 general physics, 010305 fluids & plasmas, physics - physics and society, computer science - social and information networks, physics and society (physics.soc-ph), social and information networks (cs.si), fos: physical sciences, fos: computer and information sciences","language":"und","publication_date":"2013-09-27","publisher":"Oxford University Press (OUP)","title":"Multilayer networks","type":"publication"} +{"accessright":"UNKNOWN","country":"","description":"","id":"50|doi_________::715fec7723208e6f17e855c204656e2f","keywords":"","language":"und","publication_date":"1998-10-19","publisher":"American Mathematical\\n Society","title":"Good encodings for DNA-based solutions to combinatorial problems","type":"publication"} \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00000 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00000 index e69de29..23c80f9 100644 --- a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00000 +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00000 @@ -0,0 +1 @@ +50|doi_________::715fec7723208e6f17e855c204656e2f \ No newline at end of file diff --git a/pom.xml b/pom.xml index 7f650b4..ee4ec11 100644 --- a/pom.xml +++ b/pom.xml @@ -102,7 +102,7 @@ 5.6.1 3.5 11.0.2 - [2.12.1] + [2.13.1-patched] \ No newline at end of file From 8a44653dbe63b61d63621c3eed2b7f7265ba6f2e Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 5 Jul 2023 09:58:55 +0200 Subject: [PATCH 14/19] [DumpCSV] fixing issues --- .../dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java index f94ad8f..03a5785 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java @@ -149,9 +149,9 @@ public class SparkDumpResults implements Serializable { ar.setRank(String.valueOf(count)); } } - ar.setFirstName(a.getName()); - ar.setLastName(a.getSurname()); - ar.setFullName(a.getFullname()); + ar.setFirstName(a.getName().replace("\t", " ").replace("\n", " ").replace("\r", " ")); + ar.setLastName(a.getSurname().replace("\t", " ").replace("\n", " ").replace("\r", " ")); + ar.setFullName(a.getFullname().replace("\t", " ").replace("\n", " ").replace("\r", " ")); Tuple2 orcid = getOrcid(a.getPid()); if (Optional.ofNullable(orcid).isPresent()) { ar.setOrcid(orcid._1()); @@ -207,7 +207,7 @@ public class SparkDumpResults implements Serializable { .distinct() .map(p -> { CSVPid ret = new CSVPid(); - ret.setId(DHPUtils.md5(p)); + ret.setId(DHPUtils.md5(p + "@" + resultId)); ret.setResult_id(resultId); ret.setPid(split(p, "@")[1]); ret.setType(split(p, "@")[0]); From 9d1b708a89bf860cd8b8ff8e1390b887560123d3 Mon Sep 17 00:00:00 2001 
From: "miriam.baglioni" Date: Fri, 7 Jul 2023 17:44:19 +0200 Subject: [PATCH 15/19] [dumpCSV] addressing the issues fointed out by the Dare Lab people. Repeated relations from author to result due to the author repeated in the data. Repeated relations from result to result due to the same pid present in more that one result. Author table not properly formatted due to the bad formatting of the input data --- .../oa/graph/dump/csv/SparkDumpResults.java | 35 +++++++++++++++---- .../dhp/oa/graph/dump/csv/DumpResultTest.java | 22 ++++++------ .../dhp/oa/graph/dump/csv/input/publication | 3 +- .../dump/csv/working/resultIds/part-00169 | 1 + 4 files changed, 42 insertions(+), 19 deletions(-) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java index 03a5785..4e46979 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java @@ -137,6 +137,7 @@ public class SparkDumpResults implements Serializable { .flatMap((FlatMapFunction) r -> { int count = 0; List arl = new ArrayList<>(); + Set authorIds = new HashSet(); if (Optional.ofNullable(r.getAuthor()).isPresent()) { for (Author a : r.getAuthor()) { count += 1; @@ -149,9 +150,9 @@ public class SparkDumpResults implements Serializable { ar.setRank(String.valueOf(count)); } } - ar.setFirstName(a.getName().replace("\t", " ").replace("\n", " ").replace("\r", " ")); - ar.setLastName(a.getSurname().replace("\t", " ").replace("\n", " ").replace("\r", " ")); - ar.setFullName(a.getFullname().replace("\t", " ").replace("\n", " ").replace("\r", " ")); + ar.setFirstName(replace(a.getName())); + ar.setLastName(replace(a.getSurname())); + ar.setFullName(replace(a.getFullname())); Tuple2 orcid = getOrcid(a.getPid()); if (Optional.ofNullable(orcid).isPresent()) { ar.setOrcid(orcid._1()); @@ -159,7 +160,12 @@ public class SparkDumpResults implements Serializable { } ar.autosetId(); - arl.add(ar); + + if(!authorIds.contains(ar.getAuthorId())){ + arl.add(ar); + authorIds.add(ar.getAuthorId()); + } + } } @@ -187,11 +193,21 @@ public class SparkDumpResults implements Serializable { .mode(SaveMode.Overwrite) .json(workingPath + "/" + resultType + "/result_author"); - // ma the authors in the working dir. I do not want to have them repeated + // ma the authors in the working dir. I do not want to have them repeated. 
If I have an orcid as id, I choose the one from orcid if any authorResult .groupByKey((MapFunction) ar -> ar.getAuthorId(), Encoders.STRING()) .mapGroups( - (MapGroupsFunction) (k, it) -> getAuthorDump(it.next()), + (MapGroupsFunction) (k, it) -> { + AuthorResult first = it.next(); + if(!Optional.ofNullable(first.getFromOrcid()).isPresent() || first.getFromOrcid()) + return getAuthorDump(first); + while(it.hasNext()){ + AuthorResult ar = it.next(); + if(ar.getFromOrcid()) + return getAuthorDump(ar); + } + return getAuthorDump(first); + }, Encoders.bean(CSVAuthor.class)) .write() .option("compression", "gzip") @@ -200,6 +216,13 @@ public class SparkDumpResults implements Serializable { } + private static String replace(String input){ + if (Optional.ofNullable(input).isPresent()) + return input.replace("\t", " ").replace("\n", " ").replace("\r", " ").replace("\"", " "); + else + return ""; + } + private static List mapPid(List pid, String resultId) { return pid .stream() diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java index 4f3c4f9..ee58ef9 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java @@ -10,6 +10,7 @@ import java.nio.file.Path; import java.util.HashMap; import java.util.Optional; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -117,7 +118,7 @@ public class DumpResultTest { tmp.show(false); - Assertions.assertEquals(4, tmp.count()); + Assertions.assertEquals(5, tmp.count()); CSVResult row = tmp .filter( (FilterFunction) r -> r.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")) @@ -196,7 +197,7 @@ public class DumpResultTest { SparkDumpResults.main(new String[] { "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-outputPath", workingDir.toString() + "/output", + "-workingPath", workingDir.toString() + "/working", "-resultType", "publication", "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", @@ -205,26 +206,23 @@ public class DumpResultTest { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - Dataset tmp = spark - .read() - .option("header", "true") - .option("delimiter", Constants.SEP) - .csv(workingDir.toString() + "/working/publication/author"); + Dataset tmp = Utils + .readPath(spark, workingDir.toString() + "/working/publication/author", CSVAuthor.class); - Assertions.assertEquals(5, tmp.count()); + Assertions.assertEquals(13, tmp.count()); Assertions.assertEquals(1, tmp.where("firstName == 'Maryam'").count()); Assertions .assertEquals( DHPUtils.md5("50|DansKnawCris::0224aae28af558f21768dbc6439c7a951"), - tmp.where("firstName == 'Maryam'").first().getAs("id")); + tmp.where("firstName == 'Maryam'").first().getId()); Assertions - .assertEquals(DHPUtils.md5("0000-0003-2914-2734"), tmp.where("firstName == 'Michael'").first().getAs("id")); + .assertEquals(DHPUtils.md5("0000-0003-2914-2734"), tmp.where("firstName == 'Michael'").first().getId()); Assertions .assertEquals( - DHPUtils.md5("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d92"), - tmp.where("firstName == 'Mikhail'").first().getAs("id")); + DHPUtils.md5("0000-0002-6660-5673"), + tmp.where("firstName == 'Mikhail'").first().getId()); } diff --git 
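The mapGroups step above reduces each group of AuthorResult rows sharing the same author id to a single representative: the first row if its id is (or may be) ORCID-derived, otherwise the first ORCID-derived row found, falling back to the first row. A plain-Java sketch of that selection follows, with a stand-in Author class replacing the patch's AuthorResult bean (the names and the null-safe check are illustrative, not part of the patch):

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class PreferOrcidSketch {

	// Stand-in for AuthorResult: only the fields needed by the selection are kept.
	static class Author {
		final String fullName;
		final Boolean fromOrcid; // may be null on records that never set the flag

		Author(String fullName, Boolean fromOrcid) {
			this.fullName = fullName;
			this.fromOrcid = fromOrcid;
		}
	}

	// Same shape as the (k, it) -> { ... } lambda passed to mapGroups in the hunk above.
	static Author pickRepresentative(Iterator<Author> it) {
		Author first = it.next();
		if (first.fromOrcid == null || first.fromOrcid)
			return first;
		while (it.hasNext()) {
			Author a = it.next();
			if (Boolean.TRUE.equals(a.fromOrcid))
				return a;
		}
		return first;
	}

	public static void main(String[] args) {
		List<Author> sameAuthorId = Arrays.asList(
			new Author("Raskin, Mikhail", false), // row whose id did not come from an ORCID
			new Author("Raskin, Mikhail", true)); // row flagged as ORCID-derived
		// Prints "true": the ORCID-backed record wins over the fallback one.
		System.out.println(pickRepresentative(sameAuthorId.iterator()).fromOrcid);
	}
}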
a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication index 9ccfc32..0148148 100644 --- a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/input/publication @@ -2,4 +2,5 @@ {"author":[{"affiliation":[],"fullname":"Blondin, Michael","name":"Michael","pid":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"orcid","classname":"Open Researcher and Contributor ID","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"0000-0003-2914-2734"}],"rank":1,"surname":"Blondin"},{"affiliation":[],"fullname":"Raskin, Mikhail","name":"Mikhail","pid":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"orcid_pending","classname":"Open Researcher and Contributor ID","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"0000-0002-6660-5673"}],"rank":2,"surname":"Raskin"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}],"contributor":[],"country":[{"classid":"IT","classname":"Finland","dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"country:instrepos","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"schemeid":"dnet:countries","schemename":"dnet:countries"},{"classid":"FI","classname":"Finland","dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"country:instrepos","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"schemeid":"dnet:countries","schemename":"dnet:countries"}],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:03:57.761Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9","instance":[{"accessright":{"classid":"OPEN","classname":"Open 
Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591283087415,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T07:58:39Z","harvestDate":"2020-05-25T11:34:38.707Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce-kb:document:800020324","metadataNamespace":""}},"originalId":["DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceFullCatalogue&search=priref=800020324"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"
0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} {"author":[{"affiliation":[],"fullname":"Ward, Mark Daniel","name":"Mark Daniel","pid":[],"rank":1,"surname":"Ward"},{"affiliation":[],"fullname":"Szpankowski, Wojciech","name":"Wojciech","pid":[],"rank":2,"surname":"Szpankowski"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"ni"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:13:23.976Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::26780065282e607306372abd0d808245","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282897527,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T08:42:33Z","harvestDate":"2020-05-25T11:40:10.845Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550053196","metadataNamespace":""}},"originalId":["DansKnawCris::26780065282e607306372abd0d808245"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} {"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open 
Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[{"dataInfo":[{"deletedbyinference":false,"inferenceprovenance":"bulktagging","inferred":true,"invisible":false,"provenanceaction":{"classid":"community:subject","classname":"Bulktagging for Community - Subject","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":null}],"id":"enermaps"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":null,"dateofcollection":"","dateoftransformation":"2020-05-25T17:13:23.976Z","description":[],"embargoenddate":null,"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|DansKnawCris::26780065282e607306372abd0d80fake","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"dateofacceptance":null,"distributionlocation":"","hostedby":{"dataInfo":null,"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0017","classname":"Report","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":null,"processingchargeamount":null,"processingchargecurrency":null,"refereed":null,"url":null}],"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591282897527,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2019-12-01T08:42:33Z","harvestDate":"2020-05-25T11:40:10.845Z","identifier":"oai:services.nod.dans.knaw.nl:Publications/rce:document:550053196","metadataNamespace":""}},"originalId":["DansKnawCris::26780065282e607306372abd0d808245"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"http://cultureelerfgoed.adlibsoft.com/dispatcher.aspx?action=search&database=ChoiceRapporten&search=priref=550053196"}],"publisher":null,"relevantdate":[],"resourcetype":{"classid":"0017","classname":"0017
","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"archeologie"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"None"}],"journal":null} -{"dataInfo": {"invisible": false, "trust": "0.9", "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "deletedbyinference": false}, "resourcetype": {"classid": "0013", "classname": "Part of book or chapter of book", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}, "pid": [{"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.1090/dimacs/044/20"}], "contributor": [], "bestaccessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "relevantdate": [{"qualifier": {"classid": "created", "classname": "created", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2017-04-27"}, {"qualifier": {"classid": "published-print", "classname": "published-print", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "1998-10-19"}], "collectedfrom": [{"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}], "id":"50|doi_________::715fec7723208e6f17e855c204656e2f", "subject": [], "dateofacceptance": {"value": "1998-10-19"}, "lastupdatetimestamp": 1675978002598, "author": [{"surname": "Deaton", "fullname": "R. Deaton", "pid": [], "name": "R.", "rank": 1}, {"surname": "Murphy", "fullname": "R. Murphy", "pid": [], "name": "R.", "rank": 2}, {"surname": "Garzon", "fullname": "M. 
Garzon", "pid": [], "name": "M.", "rank": 3}, {"surname": "Franceschetti", "fullname": "D. Franceschetti", "pid": [], "name": "D.", "rank": 4}, {"surname": "Stevens", "fullname": "S. Stevens", "pid": [], "name": "S.", "rank": 5}], "instance": [{"refereed": {"classid": "0000", "classname": "UNKNOWN", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "collectedfrom": {"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}, "hostedby": {"dataInfo": {"invisible": false, "deletedbyinference": false}, "value": "Unknown Repository", "key": "10|openaire____::55045bd2a65019fd8e6741a755395c8c"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "dateofacceptance": {"value": "1998-10-19"}, "url": ["https://doi.org/10.1090/dimacs/044/20"], "measures": [{"id": "influence", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "1.9184702E-8", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "5.79069E-9", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "influence_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "51", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "0.8491071", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, 
{"id": "impulse", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "2", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}], "pid": [{"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.1090/dimacs/044/20"}], "instancetype": {"classid": "0013", "classname": "Part of book or chapter of book", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "dateofcollection": "2023-02-09T21:26:42Z", "fulltext": [], "description": [], "format": [], "measures": [{"id": "influence", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "1.7008906E-8", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "5.1452145E-9", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "influence_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "51", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "0.50946426", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": 
"measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "impulse", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "2", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}], "coverage": [], "externalReference": [], "publisher": {"value": "American Mathematical\\n Society"}, "eoscifguidelines": [], "language": {"classid": "und", "classname": "Undetermined", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "country": [], "extraInfo": [], "originalId": ["10.1090/dimacs/044/20", "50|doiboost____::715fec7723208e6f17e855c204656e2f"], "source": [{"value": "Crossref"}], "context": [], "title": [{"qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Good encodings for DNA-based solutions to combinatorial problems"}]} \ No newline at end of file +{"dataInfo": {"invisible": false, "trust": "0.9", "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "deletedbyinference": false}, "resourcetype": {"classid": "0013", "classname": "Part of book or chapter of book", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}, "pid": [{"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.1090/dimacs/044/20"}], "contributor": [], "bestaccessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "relevantdate": [{"qualifier": {"classid": "created", "classname": "created", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2017-04-27"}, {"qualifier": {"classid": "published-print", "classname": "published-print", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "1998-10-19"}], "collectedfrom": [{"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}], "id":"50|doi_________::715fec7723208e6f17e855c204656e2f", "subject": [], "dateofacceptance": {"value": "1998-10-19"}, "lastupdatetimestamp": 1675978002598, "author": [{"surname": "Deaton", "fullname": "R. Deaton", "pid": [], "name": "R.", "rank": 1}, {"surname": "Murphy", "fullname": "R. Murphy", "pid": [], "name": "R.", "rank": 2}, {"surname": "Garzon", "fullname": "M. Garzon", "pid": [], "name": "M.", "rank": 3}, {"surname": "Franceschetti", "fullname": "D. Franceschetti", "pid": [], "name": "D.", "rank": 4}, {"surname": "Stevens", "fullname": "S. 
Stevens", "pid": [], "name": "S.", "rank": 5}], "instance": [{"refereed": {"classid": "0000", "classname": "UNKNOWN", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "collectedfrom": {"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}, "hostedby": {"dataInfo": {"invisible": false, "deletedbyinference": false}, "value": "Unknown Repository", "key": "10|openaire____::55045bd2a65019fd8e6741a755395c8c"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "dateofacceptance": {"value": "1998-10-19"}, "url": ["https://doi.org/10.1090/dimacs/044/20"], "measures": [{"id": "influence", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "1.9184702E-8", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "5.79069E-9", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "influence_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "51", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "0.8491071", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "impulse", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", 
"schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "2", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}], "pid": [{"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.1090/dimacs/044/20"}], "instancetype": {"classid": "0013", "classname": "Part of book or chapter of book", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "dateofcollection": "2023-02-09T21:26:42Z", "fulltext": [], "description": [], "format": [], "measures": [{"id": "influence", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "1.7008906E-8", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "5.1452145E-9", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "influence_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "51", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}, {"id": "popularity_alt", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "0.50946426", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, 
"trust": ""}, "value": "C", "key": "class"}]}, {"id": "impulse", "unit": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "2", "key": "score"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "measure:bip", "classname": "measure:bip", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "value": "C", "key": "class"}]}], "coverage": [], "externalReference": [], "publisher": {"value": "American Mathematical\\n Society"}, "eoscifguidelines": [], "language": {"classid": "und", "classname": "Undetermined", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "country": [], "extraInfo": [], "originalId": ["10.1090/dimacs/044/20", "50|doiboost____::715fec7723208e6f17e855c204656e2f"], "source": [{"value": "Crossref"}], "context": [], "title": [{"qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Good encodings for DNA-based solutions to combinatorial problems"}]} +{"author":[{"affiliation":[{"value":"Royal Institute of Technology"}],"fullname":"Athina Tympakianaki","pid":[{"qualifier":{"classid":"URL","classname":"URL","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"https://academic.microsoft.com/#/detail/2746890261"}],"rank":1},{"affiliation":[{"value":"Northeastern University"}],"fullname":"Haris N. Koutsopoulos","pid":[{"qualifier":{"classid":"URL","classname":"URL","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"https://academic.microsoft.com/#/detail/1543483944"}],"rank":2},{"affiliation":[{"value":"Royal Institute of Technology"}],"fullname":"Haris N. 
Koutsopoulos","pid":[{"qualifier":{"classid":"URL","classname":"URL","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"https://academic.microsoft.com/#/detail/1543483944"}],"rank":2},{"affiliation":[{"value":"Royal Institute of Technology"}],"fullname":"Erik Jenelius","pid":[{"qualifier":{"classid":"URL","classname":"URL","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"https://academic.microsoft.com/#/detail/44823834"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.91"},"qualifier":{"classid":"orcid","classname":"Open Researcher and Contributor ID","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"0000-0002-4106-3126"}],"rank":3}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2","value":"Crossref"},{"key":"10|openaire____::8ac8380272269217cb09a928c8caa993","value":"UnpayWall"},{"key":"10|fairsharing_::cd0f74b5955dc87fd0605745c4b49ee8","value":"Open Researcher and Contributor ID Registry"},{"key":"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a","value":"Microsoft Academic Graph"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:actionset","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"value":"2018-01-01"},"dateofcollection":"2023-05-12T17:10:38Z","description":[{"value":"The Simultaneous Perturbation Stochastic Approximation (SPSA) algorithm has been used for solving the off-line dynamic origin-destination (OD) estimation problem. 
While the algorithm can be used wi ..."}],"eoscifguidelines":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|doi_________::16e142b54fbddb2cf1c71ff7460e2792","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","openAccessRoute":"gold","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"key":"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2","value":"Crossref"},"dateofacceptance":{"value":"2018-01-01"},"hostedby":{"key":"10|issn___print::8e5fa0b3dde7aa9c08716c4705189ead","value":"Procedia Computer Science"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":{"value":"http://creativecommons.org/licenses/by-nc-nd/4.0/"},"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"6.5640458E-9"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"class","value":"C"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"2.2657568E-8"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"class","value":"C"}]},{"id":"influence_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"11"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"class","value":"C"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"3.9216"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"class","value":"C"}]},{"id":"impulse","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"
"},"key":"score","value":"9"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"class","value":"C"}]}],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1016/j.procs.2018.04.012"}],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://doi.org/10.1016/j.procs.2018.04.012"]}],"journal":{"ep":"64","issnPrinted":"1877-0509","name":"Procedia Computer Science","sp":"57","vol":"130"},"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1683911438408,"measures":[{"id":"influence","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"5.819284E-9"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"class","value":"C"}]},{"id":"popularity","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"1.9254836E-8"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"class","value":"C"}]},{"id":"influence_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"11"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"class","value":"C"}]},{"id":"popularity_alt","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"2.35296"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"class","value":"C"}]},{"id":"impulse","unit":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"clas
sid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"score","value":"9"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"measure:bip","classname":"measure:bip","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"key":"class","value":"C"}]}],"originalId":["S1877050918303624","10.1016/j.procs.2018.04.012","50|doiboost____::16e142b54fbddb2cf1c71ff7460e2792","2786369073"],"pid":[{"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1016/j.procs.2018.04.012"}],"publisher":{"value":"Elsevier BV"},"relevantdate":[{"qualifier":{"classid":"created","classname":"created","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2018-04-24"},{"qualifier":{"classid":"published-print","classname":"published-print","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2018-01-01"}],"resourcetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"value":"Crossref"},{"value":"ANT/SEIT"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"subject:fos","classname":"subject:fos","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"qualifier":{"classid":"FOS","classname":"Fields of Science and Technology classification","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"050210 logistics & transportation"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"subject:fos","classname":"subject:fos","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"qualifier":{"classid":"FOS","classname":"Fields of Science and Technology classification","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"021103 operations research"},{"qualifier":{"classid":"MAG","classname":"Microsoft Academic Graph classification","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Computer science"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"subject:fos","classname":"subject:fos","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"qualifier":{"classid":"FOS","classname":"Fields of Science and Technology classification","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"05 social sciences"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_classes","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.891"},"qualifier":{"classid":"ACM","classname":"ACM Computing Classification 
System","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"MathematicsofComputing_NUMERICALANALYSIS"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"subject:fos","classname":"subject:fos","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"qualifier":{"classid":"FOS","classname":"Fields of Science and Technology classification","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"0211 other engineering and technologies"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"subject:fos","classname":"subject:fos","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"qualifier":{"classid":"FOS","classname":"Fields of Science and Technology classification","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"02 engineering and technology"},{"qualifier":{"classid":"MAG","classname":"Microsoft Academic Graph classification","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Matrix estimation"},{"qualifier":{"classid":"MAG","classname":"Microsoft Academic Graph classification","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Simultaneous perturbation stochastic approximation"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"update","inferred":true,"invisible":false,"provenanceaction":{"classid":"subject:fos","classname":"subject:fos","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":""},"qualifier":{"classid":"FOS","classname":"Fields of Science and Technology classification","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"0502 economics and business"},{"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"General Earth and Planetary Sciences"},{"qualifier":{"classid":"MAG","classname":"Microsoft Academic Graph classification","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Algorithm"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_classes","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.7245"},"qualifier":{"classid":"arxiv","classname":"arXiv","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Computer Science::Databases"},{"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"General Environmental Science"}],"title":[{"qualifier":{"classid":"alternative title","classname":"alternative title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Robust SPSA algorithms for dynamic OD matrix estimation"},{"qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Robust SPSA algorithms for dynamic OD matrix estimation"}]}' \ No newline at end of file diff --git a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00169 b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00169 index 7ceab26..e7e15ab 100644 --- a/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00169 +++ b/dump/src/test/resources/eu/dnetlib/dhp/oa/graph/dump/csv/working/resultIds/part-00169 @@ -1 +1,2 @@ 50|DansKnawCris::26780065282e607306372abd0d808245 +50|doi_________::16e142b54fbddb2cf1c71ff7460e2792 From 3bfac8bc6ed497f5b64f560c71a3389bbcfbd854 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 7 Jul 2023 18:01:26 +0200 Subject: [PATCH 16/19] [dumpCSV] addressing the issues fointed out by the Dare Lab people. Repeated relations from author to result due to the author repeated in the data. Repeated relations from result to result due to the same pid present in more that one result. Author table not properly formatted due to the bad formatting of the input data --- .../dhp/oa/graph/dump/csv/SparkDumpResults.java | 13 +++++++------ .../dhp/oa/graph/dump/csv/DumpResultTest.java | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java index 4e46979..ca91bd8 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java @@ -161,7 +161,7 @@ public class SparkDumpResults implements Serializable { ar.autosetId(); - if(!authorIds.contains(ar.getAuthorId())){ + if (!authorIds.contains(ar.getAuthorId())) { arl.add(ar); authorIds.add(ar.getAuthorId()); } @@ -193,17 +193,18 @@ public class SparkDumpResults implements Serializable { .mode(SaveMode.Overwrite) .json(workingPath + "/" + resultType + "/result_author"); - // ma the authors in the working dir. I do not want to have them repeated. If I have an orcid as id, I choose the one from orcid if any + // ma the authors in the working dir. I do not want to have them repeated. 
If I have an orcid as id, I choose + // the one from orcid if any authorResult .groupByKey((MapFunction) ar -> ar.getAuthorId(), Encoders.STRING()) .mapGroups( (MapGroupsFunction) (k, it) -> { AuthorResult first = it.next(); - if(!Optional.ofNullable(first.getFromOrcid()).isPresent() || first.getFromOrcid()) + if (!Optional.ofNullable(first.getFromOrcid()).isPresent() || first.getFromOrcid()) return getAuthorDump(first); - while(it.hasNext()){ + while (it.hasNext()) { AuthorResult ar = it.next(); - if(ar.getFromOrcid()) + if (ar.getFromOrcid()) return getAuthorDump(ar); } return getAuthorDump(first); @@ -216,7 +217,7 @@ public class SparkDumpResults implements Serializable { } - private static String replace(String input){ + private static String replace(String input) { if (Optional.ofNullable(input).isPresent()) return input.replace("\t", " ").replace("\n", " ").replace("\r", " ").replace("\"", " "); else diff --git a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java index ee58ef9..e285098 100644 --- a/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java +++ b/dump/src/test/java/eu/dnetlib/dhp/oa/graph/dump/csv/DumpResultTest.java @@ -10,7 +10,6 @@ import java.nio.file.Path; import java.util.HashMap; import java.util.Optional; -import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -35,6 +34,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.oa.graph.dump.Utils; +import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVAuthor; import eu.dnetlib.dhp.oa.graph.dump.csv.model.CSVResult; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Publication; @@ -207,7 +207,7 @@ public class DumpResultTest { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); Dataset tmp = Utils - .readPath(spark, workingDir.toString() + "/working/publication/author", CSVAuthor.class); + .readPath(spark, workingDir.toString() + "/working/publication/author", CSVAuthor.class); Assertions.assertEquals(13, tmp.count()); From baef25560a4203692fd2fe76aa232508fec70678 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 11 Jul 2023 13:47:28 +0200 Subject: [PATCH 17/19] [dumpCSV] align pom version with master for graph --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index ee4ec11..00b5e97 100644 --- a/pom.xml +++ b/pom.xml @@ -102,7 +102,7 @@ 5.6.1 3.5 11.0.2 - [2.13.1-patched] + [3.17.1] \ No newline at end of file From b01573e2015ea39299a04f68bde2d0198160fce9 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 12 Jul 2023 07:38:53 +0200 Subject: [PATCH 18/19] [dumpCSV] removed output directory before starting the jobs --- ...iriam.baglioni@hadoop-edge3.garr-pa1.d4science.org | 5 +++++ .../eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java | 1 + .../dump/csv/SparkSelectResultsAndDumpRelations.java | 2 +- .../dhp/oa/graph/dump/csv/oozie_app/workflow.xml | 11 ++++++++++- 4 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 dump/miriam.baglioni@hadoop-edge3.garr-pa1.d4science.org diff --git a/dump/miriam.baglioni@hadoop-edge3.garr-pa1.d4science.org b/dump/miriam.baglioni@hadoop-edge3.garr-pa1.d4science.org new file mode 100644 index 0000000..41a91b3 --- /dev/null +++ 
b/dump/miriam.baglioni@hadoop-edge3.garr-pa1.d4science.org @@ -0,0 +1,5 @@ +id name acronym description +04a00617ca659adc944977ac700ea14b Digital Humanities and Cultural Heritage dh-ch This community gathers research results, data, scientific publications and projects related to the domain of Digital Humanities. This broad definition includes Humanities, Cultural Heritage, History, Archaeology and related fields. +3ee95893613de7450247d7fef747136f DARIAH EU dariah The Digital Research Infrastructure for the Arts and Humanities (DARIAH) aims to enhance and support digitally-enabled research and teaching across the arts and humanities. It develops, maintains and operates an infrastructure in support of ICT-based research practices and sustains researchers in using them to build, analyse and interpret digital resources. DARIAH was established as a European Research Infrastructure Consortium (ERIC) in August 2014. Currently, DARIAH has 18 Members and several cooperating partners in eight non-member countries. Here you will find a growing collection of DARIAH-affiliated research outputs and other documents. +5fde864866ea5ded4cc873b3170b63c3 Transport Research beopen Welcome to the Open Research Gateway for Transport Research. This gateway is part of the TOPOS Observatory (https://www.topos-observatory.eu). The TOPOS aims to showcase the status and progress of open science uptake in transport research. It focuses on promoting territorial and cross border cooperation and contributing in the optimization of open science in transport research. The TOPOS Observatory is supported by the EC H2020 BEOPEN project (824323) +aa0e56dd2e9d2a0be749f5debdd2b3d8 Energy Research enermaps

EnerMaps Open Data Management Tool aims to  improve data management  and  accessibility  in the field of  energy research  for the  renewable energy industry.

EnerMaps’ tool accelerates and facilitates the energy transition offering a qualitative and user-friendly digital platform to the energy professionals.

The project is based on the  FAIR data principle  which requires data to be  Findable,  Accessible,  Interoperable and  Reusable.

EnerMaps project  coordinates and enriches existing energy databases to promote  trans-disciplinary research  and to develop partnerships between researchers and the energy professionals.

The EnerMaps project has received funding from the European Union’s Horizon 2020 research and innovation program under   grant agreement N°884161

 

Website:  https://enermaps.eu/ 

 diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java index 9414cda..625d6b0 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java @@ -21,6 +21,7 @@ import eu.dnetlib.dhp.oa.model.Indicator; import eu.dnetlib.dhp.oa.model.Instance; import eu.dnetlib.dhp.oa.model.OpenAccessRoute; import eu.dnetlib.dhp.oa.model.Result; +import eu.dnetlib.dhp.oa.model.Subject; import eu.dnetlib.dhp.oa.model.community.CfHbKeyValue; import eu.dnetlib.dhp.oa.model.community.CommunityInstance; import eu.dnetlib.dhp.oa.model.community.CommunityResult; diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java index 522ce8b..30772fa 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkSelectResultsAndDumpRelations.java @@ -75,7 +75,7 @@ public class SparkSelectResultsAndDumpRelations implements Serializable { conf, isSparkSessionManaged, spark -> { - Utils.removeOutputDir(spark, outputPath + RESULT_COMMUNITY_TABLE); + // Utils.removeOutputDir(spark, outputPath); run(spark, inputPath, outputPath, workingPath, finalCommunityList); }); diff --git a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml index 543de1c..bacf0a3 100644 --- a/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml +++ b/dump/src/main/resources/eu/dnetlib/dhp/oa/graph/dump/csv/oozie_app/workflow.xml @@ -65,7 +65,16 @@ - + + + + + + + + + + From 5ff50d115a7dff8a085429404291811f950ad75c Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 17 Jul 2023 16:21:20 +0200 Subject: [PATCH 19/19] [dumpCSV] adding double quotes enclosing all the fields (an illustrative sketch of the quoting and break removal follows this patch) --- .../dhp/oa/graph/dump/csv/Constants.java | 8 ++++++ .../oa/graph/dump/csv/SparkDumpResults.java | 16 +++++++----- .../oa/graph/dump/csv/model/CSVAuthor.java | 12 +++++---- .../oa/graph/dump/csv/model/CSVCitation.java | 8 +++--- .../dhp/oa/graph/dump/csv/model/CSVPid.java | 10 +++++--- .../dump/csv/model/CSVRELCommunityResult.java | 6 +++-- .../oa/graph/dump/csv/model/CSVRelResAut.java | 6 +++-- .../oa/graph/dump/csv/model/CSVResult.java | 25 +++++++++++-------- 8 files changed, 59 insertions(+), 32 deletions(-) diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constants.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constants.java index cfb1fd7..24fff80 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constants.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/Constants.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.oa.graph.dump.csv; +import org.apache.commons.lang.StringUtils; + import java.io.Serializable; /** @@ -9,4 +11,10 @@ import java.io.Serializable; */ public class Constants implements Serializable { public final static String SEP = "\t"; + + public static final String addQuotes(String id) { + if(StringUtils.isNotEmpty(id)) + return "\"" + id + "\""; + return id; + } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java index ca91bd8..f2d1e73 100644 ---
a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/SparkDumpResults.java @@ -290,13 +290,13 @@ public class SparkDumpResults implements Serializable { private static CSVResult mapResultInfo(R r) { CSVResult ret = new CSVResult(); - ret.setId(r.getId()); - ret.setType(r.getResulttype().getClassid()); + ret.setId(removeBreaks(r.getId())); + ret.setType(removeBreaks(r.getResulttype().getClassid())); ret.setTitle(getTitle(r.getTitle())); ret.setDescription(getAbstract(r.getDescription())); - ret.setAccessright(r.getBestaccessright().getClassid()); - ret.setPublication_date(getFieldValue(r.getDateofacceptance())); - ret.setPublisher(getFieldValue(r.getPublisher())); + ret.setAccessright(removeBreaks(r.getBestaccessright().getClassid())); + ret.setPublication_date(removeBreaks(getFieldValue(r.getDateofacceptance()))); + ret.setPublisher(removeBreaks(getFieldValue(r.getPublisher()))); if (Optional.ofNullable(r.getSubject()).isPresent()) ret.setKeywords(String.join(", ", r.getSubject().stream().map(s -> { @@ -354,7 +354,11 @@ public class SparkDumpResults implements Serializable { } private static String removeBreaks(String input) { - return input.replace("\n", " ").replace("\t", " ").replace("\r", " "); + return input.replace("\n", " ").replace("\t", " ") + .replace("\r", " ") + .replace("\\\"", " ") + .replace("\"", " ") + ; } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java index 101ce33..c3057e9 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVAuthor.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.oa.graph.dump.csv.model; +import eu.dnetlib.dhp.oa.graph.dump.csv.Constants; + import java.io.Serializable; /** @@ -28,7 +30,7 @@ public class CSVAuthor implements Serializable { } public void setId(String id) { - this.id = id; + this.id = Constants.addQuotes(id); } public String getFirstname() { @@ -36,7 +38,7 @@ public class CSVAuthor implements Serializable { } public void setFirstname(String firstname) { - this.firstname = firstname; + this.firstname = Constants.addQuotes(firstname); } public String getLastname() { @@ -44,7 +46,7 @@ public class CSVAuthor implements Serializable { } public void setLastname(String lastname) { - this.lastname = lastname; + this.lastname = Constants.addQuotes(lastname); } public String getFullname() { @@ -52,7 +54,7 @@ public class CSVAuthor implements Serializable { } public void setFullname(String fullname) { - this.fullname = fullname; + this.fullname = Constants.addQuotes(fullname); } public String getOrcid() { @@ -60,7 +62,7 @@ public class CSVAuthor implements Serializable { } public void setOrcid(String orcid) { - this.orcid = orcid; + this.orcid = Constants.addQuotes(orcid); } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java index d7b54e3..9b0fe14 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVCitation.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.oa.graph.dump.csv.model; +import eu.dnetlib.dhp.oa.graph.dump.csv.Constants; + import java.io.Serializable; /** @@ -17,7 +19,7 @@ public class CSVCitation implements Serializable { } public void 
setId(String id) { - this.id = id; + this.id = Constants.addQuotes(id); } public String getResult_id_cites() { @@ -25,7 +27,7 @@ public class CSVCitation implements Serializable { } public void setResult_id_cites(String result_id_cites) { - this.result_id_cites = result_id_cites; + this.result_id_cites = Constants.addQuotes(result_id_cites); } public String getResult_id_cited() { @@ -33,6 +35,6 @@ public class CSVCitation implements Serializable { } public void setResult_id_cited(String result_id_cited) { - this.result_id_cited = result_id_cited; + this.result_id_cited = Constants.addQuotes(result_id_cited); } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java index 873e8f7..c61ae0c 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVPid.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.oa.graph.dump.csv.model; +import eu.dnetlib.dhp.oa.graph.dump.csv.Constants; + import java.io.Serializable; /** @@ -19,7 +21,7 @@ public class CSVPid implements Serializable { } public void setResult_id(String result_id) { - this.result_id = result_id; + this.result_id = Constants.addQuotes(result_id); } public String getPid() { @@ -27,7 +29,7 @@ public class CSVPid implements Serializable { } public void setPid(String pid) { - this.pid = pid; + this.pid = Constants.addQuotes(pid); } public String getType() { @@ -35,7 +37,7 @@ public class CSVPid implements Serializable { } public void setType(String type) { - this.type = type; + this.type = Constants.addQuotes(type); } public String getId() { @@ -43,6 +45,6 @@ public class CSVPid implements Serializable { } public void setId(String id) { - this.id = id; + this.id = Constants.addQuotes(id); } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java index bf81fce..bbc0a9a 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRELCommunityResult.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.oa.graph.dump.csv.model; +import eu.dnetlib.dhp.oa.graph.dump.csv.Constants; + import java.io.Serializable; /** @@ -16,7 +18,7 @@ public class CSVRELCommunityResult implements Serializable { } public void setResult_id(String result_id) { - this.result_id = result_id; + this.result_id = Constants.addQuotes(result_id); } public String getCommunity_id() { @@ -24,6 +26,6 @@ public class CSVRELCommunityResult implements Serializable { } public void setCommunity_id(String community_id) { - this.community_id = community_id; + this.community_id = Constants.addQuotes(community_id); } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java index 610668e..1b334c8 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVRelResAut.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.oa.graph.dump.csv.model; +import eu.dnetlib.dhp.oa.graph.dump.csv.Constants; + import java.io.Serializable; /** @@ -16,7 +18,7 @@ public class CSVRelResAut implements Serializable { } public void setResult_id(String result_id) { - this.result_id = result_id; + this.result_id = 
Constants.addQuotes(result_id); } public String getAuthor_id() { @@ -24,6 +26,6 @@ public class CSVRelResAut implements Serializable { } public void setAuthor_id(String author_id) { - this.author_id = author_id; + this.author_id = Constants.addQuotes(author_id); } } diff --git a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java index 1baee9b..da8a78c 100644 --- a/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java +++ b/dump/src/main/java/eu/dnetlib/dhp/oa/graph/dump/csv/model/CSVResult.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.graph.dump.csv.model; import java.io.Serializable; +import eu.dnetlib.dhp.oa.graph.dump.csv.Constants; import org.apache.commons.lang.StringUtils; import com.fasterxml.jackson.annotation.JsonGetter; @@ -11,6 +12,7 @@ import com.fasterxml.jackson.annotation.JsonSetter; import eu.dnetlib.dhp.schema.oaf.Country; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import sun.swing.StringUIClientPropertyKey; /** * @author miriam.baglioni @@ -33,15 +35,16 @@ public class CSVResult implements Serializable { } public void setId(String id) { - this.id = id; + this.id = Constants.addQuotes(id); } + public String getType() { return type; } public void setType(String type) { - this.type = type; + this.type = Constants.addQuotes(type); } public String getTitle() { @@ -49,7 +52,8 @@ public class CSVResult implements Serializable { } public void setTitle(String title) { - this.title = title; + this.title = + Constants.addQuotes(title); } public String getDescription() { @@ -57,7 +61,8 @@ public class CSVResult implements Serializable { } public void setDescription(String description) { - this.description = description; + this.description = + Constants.addQuotes(description); } public String getAccessright() { @@ -65,7 +70,7 @@ public class CSVResult implements Serializable { } public void setAccessright(String accessright) { - this.accessright = accessright; + this.accessright = Constants.addQuotes(accessright); } public String getPublication_date() { @@ -73,7 +78,7 @@ public class CSVResult implements Serializable { } public void setPublication_date(String publication_date) { - this.publication_date = publication_date; + this.publication_date = Constants.addQuotes(publication_date); } public String getPublisher() { @@ -81,7 +86,7 @@ public class CSVResult implements Serializable { } public void setPublisher(String publisher) { - this.publisher = publisher; + this.publisher = Constants.addQuotes(publisher); } public String getKeywords() { @@ -89,7 +94,7 @@ public class CSVResult implements Serializable { } public void setKeywords(String keywords) { - this.keywords = keywords; + this.keywords = Constants.addQuotes(keywords); } public String getCountry() { @@ -97,7 +102,7 @@ public class CSVResult implements Serializable { } public void setCountry(String country) { - this.country = country; + this.country = Constants.addQuotes(country); } public String getLanguage() { @@ -105,7 +110,7 @@ public class CSVResult implements Serializable { } public void setLanguage(String language) { - this.language = language; + this.language = Constants.addQuotes(language); } }
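
The fix in PATCH 16/19 rests on two ideas that are easy to miss in the diff: the author id is the md5 of the ORCID when one is available (otherwise the md5 of resultId plus name and rank), and when several records collapse onto the same id the record whose id actually derives from an ORCID wins. The plain-Java sketch below illustrates that selection outside of Spark; the AuthorRec class, the dedup() helper and the sample values are hypothetical simplifications, not the project's API — only the id derivation mirrors the patch.

```java
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.*;

// Minimal sketch of the author deduplication idea from PATCH 16/19 (hypothetical types).
public class AuthorDedupSketch {

	static class AuthorRec {
		String resultId, firstName, lastName, orcid, rank;
		String authorId;
		boolean fromOrcid;

		AuthorRec(String resultId, String firstName, String lastName, String orcid, String rank) {
			this.resultId = resultId;
			this.firstName = firstName;
			this.lastName = lastName;
			this.orcid = orcid;
			this.rank = rank;
			// Prefer a stable id derived from the ORCID; fall back to result-scoped data.
			if (orcid != null && !orcid.isEmpty()) {
				this.authorId = md5(orcid);
				this.fromOrcid = true;
			} else {
				this.authorId = md5(resultId + firstName + lastName + rank);
				this.fromOrcid = false;
			}
		}
	}

	// Group by authorId and keep one record per group, preferring the ORCID-based one.
	static Collection<AuthorRec> dedup(List<AuthorRec> authors) {
		Map<String, AuthorRec> byId = new LinkedHashMap<>();
		for (AuthorRec ar : authors) {
			byId.merge(ar.authorId, ar, (kept, incoming) -> incoming.fromOrcid ? incoming : kept);
		}
		return byId.values();
	}

	static String md5(String s) {
		try {
			MessageDigest md = MessageDigest.getInstance("MD5");
			StringBuilder sb = new StringBuilder();
			for (byte b : md.digest(s.getBytes(StandardCharsets.UTF_8)))
				sb.append(String.format("%02x", b & 0xff));
			return sb.toString();
		} catch (Exception e) {
			throw new IllegalStateException(e);
		}
	}

	public static void main(String[] args) {
		List<AuthorRec> in = Arrays.asList(
			new AuthorRec("50|r1", "Ada", "Lovelace", "0000-0001-0000-0001", "1"),
			new AuthorRec("50|r2", "Ada", "Lovelace", "0000-0001-0000-0001", "3"), // same ORCID, other result
			new AuthorRec("50|r1", "Alan", "Turing", null, "2"));
		dedup(in).forEach(a -> System.out.println(a.authorId + "\t" + a.lastName));
	}
}
```

The two sample authors sharing an ORCID collapse onto a single row, which is exactly the duplication the Dare Lab people reported for the author-to-result relations.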
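PATCH 19/19 turns the dump into a fully quoted tab-separated file: removeBreaks strips tabs, newlines and double quotes from the values, and Constants.addQuotes then encloses each non-empty value in double quotes. The sketch below chains the two steps on a single row; the CsvFieldSketch class and its sanitize() helper are illustrative assumptions, while the separator, the quoting rule and the replacement characters follow the patch.

```java
import java.util.Arrays;
import java.util.stream.Collectors;

// Illustrative sketch of the field sanitisation used for the tab-separated dump (PATCH 19/19).
public class CsvFieldSketch {

	static final String SEP = "\t";

	// Strip characters that would break a TSV row or the surrounding quotes.
	static String removeBreaks(String input) {
		return input.replace("\n", " ").replace("\t", " ").replace("\r", " ")
			.replace("\\\"", " ").replace("\"", " ");
	}

	// Enclose non-empty values in double quotes, as Constants.addQuotes does.
	static String addQuotes(String value) {
		if (value != null && !value.isEmpty())
			return "\"" + value + "\"";
		return value;
	}

	// Hypothetical convenience helper combining the two steps of the patch.
	static String sanitize(String value) {
		return addQuotes(removeBreaks(value));
	}

	public static void main(String[] args) {
		String title = "Robust SPSA algorithms\nfor dynamic \"OD\" matrix estimation";
		String publisher = "Elsevier BV";
		// One row of the result table: every field quoted, no tab or newline left inside a field.
		String row = Arrays.asList(title, publisher).stream()
			.map(CsvFieldSketch::sanitize)
			.collect(Collectors.joining(SEP));
		System.out.println(row);
	}
}
```

Cleaning before quoting matters: a stray tab or embedded double quote inside a title would otherwise shift columns or prematurely close the quoted field when the table is read back.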