forked from D-Net/dnet-hadoop
Compare commits
65 Commits
master
...
singleComm
Author | SHA1 | Date |
---|---|---|
Miriam Baglioni | 8981a82011 | |
Miriam Baglioni | eb0762622c | |
Miriam Baglioni | a469d79b84 | |
Miriam Baglioni | 251178aca8 | |
Miriam Baglioni | 7cf1f49d5e | |
Miriam Baglioni | 7465fa3f20 | |
Miriam Baglioni | bc501f41f6 | |
Miriam Baglioni | 80a7170794 | |
Miriam Baglioni | 08e731916b | |
Miriam Baglioni | 50d13a1d74 | |
Miriam Baglioni | 8c4c74a640 | |
Miriam Baglioni | 6179deb836 | |
Miriam Baglioni | 04a0d1ba6e | |
Miriam Baglioni | 6b51b69cf7 | |
Miriam Baglioni | bd4b6b053d | |
Miriam Baglioni | 26b34201ec | |
Miriam Baglioni | 3d94c12d6e | |
Miriam Baglioni | 95c5f97259 | |
Miriam Baglioni | eaf86828e6 | |
Miriam Baglioni | c58206c3ba | |
Miriam Baglioni | 3e3a45d930 | |
Miriam Baglioni | 46a322b770 | |
Miriam Baglioni | f95ec49a59 | |
Miriam Baglioni | c52355b516 | |
Miriam Baglioni | e1af14833d | |
Miriam Baglioni | 22f4930479 | |
Miriam Baglioni | 7f9b7cfcf6 | |
Miriam Baglioni | 66d64947af | |
Miriam Baglioni | ad6d0ca9eb | |
Miriam Baglioni | 26cf32c066 | |
Miriam Baglioni | 5022f1b50d | |
Miriam Baglioni | 0421f5e1d8 | |
Miriam Baglioni | 2c209e1140 | |
Miriam Baglioni | b3f02083e7 | |
Miriam Baglioni | 8d28ca9815 | |
Miriam Baglioni | 152ba8e2ef | |
Miriam Baglioni | c0c225f3b2 | |
Miriam Baglioni | daabc370c5 | |
Miriam Baglioni | f93356f690 | |
Miriam Baglioni | f7714645d2 | |
Miriam Baglioni | 4632795f25 | |
Miriam Baglioni | 870ee28dd6 | |
Miriam Baglioni | 08f8dd9454 | |
Miriam Baglioni | e5463fea01 | |
Miriam Baglioni | 16c1a27852 | |
Miriam Baglioni | d0c94462e4 | |
Miriam Baglioni | a896febc02 | |
Miriam Baglioni | 5dea729de3 | |
Miriam Baglioni | 200e7e9c46 | |
Miriam Baglioni | 931b2a2e15 | |
Miriam Baglioni | 330343937c | |
Miriam Baglioni | defbb71561 | |
Miriam Baglioni | 17049f8bde | |
Miriam Baglioni | cc11ee1cb9 | |
Miriam Baglioni | 871e5bea29 | |
Miriam Baglioni | 5d92df0627 | |
Miriam Baglioni | 9841086ef3 | |
Miriam Baglioni | d4ad740c98 | |
Miriam Baglioni | a684e1065e | |
Miriam Baglioni | f7c35e6311 | |
Miriam Baglioni | 9bdadd4ddb | |
Miriam Baglioni | 0d76e039cf | |
Miriam Baglioni | 7c86e66697 | |
Miriam Baglioni | bc09d37e8c | |
Miriam Baglioni | 815c7c11aa |
|
@ -90,6 +90,13 @@ public class MakeTarArchive implements Serializable {
|
|||
String p_string = p.toString();
|
||||
if (!p_string.endsWith("_SUCCESS")) {
|
||||
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
if (name.startsWith("part-") & name.length() > 10) {
|
||||
String tmp = name.substring(0, 10);
|
||||
if (name.contains(".")) {
|
||||
tmp += name.substring(name.indexOf("."));
|
||||
}
|
||||
name = tmp;
|
||||
}
|
||||
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
|
||||
entry.setSize(fileStatus.getLen());
|
||||
current_size += fileStatus.getLen();
|
||||
|
|
|
@ -4,7 +4,7 @@ package eu.dnetlib.dhp.schema.dump.oaf;
|
|||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Used to refer to the Article Processing Charge information. Not dumped in this release. It contains two parameters: -
|
||||
* Used to refer to the Article Processing Charge information. It contains two parameters: -
|
||||
* currency of type String to store the currency of the APC - amount of type String to stores the charged amount
|
||||
*/
|
||||
public class APC implements Serializable {
|
||||
|
|
|
@ -12,9 +12,11 @@ import java.util.List;
|
|||
* type of type String to store the type of the instance as defined in the corresponding dnet vocabulary
|
||||
* (dnet:pubication_resource). It corresponds to the instancetype.classname of the instance to be mapped - url of type
|
||||
* List<String> list of locations where the instance is accessible. It corresponds to url of the instance to be dumped -
|
||||
* publicationdate of type String to store the publication date of the instance ;// dateofacceptance; - refereed of type
|
||||
* String to store information abour tthe review status of the instance. Possible values are 'Unknown',
|
||||
* publicationdate of type String to store the publication date of the instance ;// dateofacceptance;
|
||||
* - refereed of type
|
||||
* String to store information abour the review status of the instance. Possible values are 'Unknown',
|
||||
* 'nonPeerReviewed', 'peerReviewed'. It corresponds to refereed.classname of the instance to be dumped
|
||||
* - articleprocessingcharge of type APC to store the article processing charges possibly associated to the instance
|
||||
*/
|
||||
public class Instance implements Serializable {
|
||||
|
||||
|
@ -28,6 +30,8 @@ public class Instance implements Serializable {
|
|||
|
||||
private String publicationdate;// dateofacceptance;
|
||||
|
||||
private APC articleprocessingcharge;
|
||||
|
||||
private String refereed; // peer-review status
|
||||
|
||||
public String getLicense() {
|
||||
|
@ -78,4 +82,11 @@ public class Instance implements Serializable {
|
|||
this.refereed = refereed;
|
||||
}
|
||||
|
||||
public APC getArticleprocessingcharge() {
|
||||
return articleprocessingcharge;
|
||||
}
|
||||
|
||||
public void setArticleprocessingcharge(APC articleprocessingcharge) {
|
||||
this.articleprocessingcharge = articleprocessingcharge;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,7 +19,7 @@ import java.io.Serializable;
|
|||
*/
|
||||
public class ResearchInitiative implements Serializable {
|
||||
private String id; // openaireId
|
||||
private String originalId; // context id
|
||||
private String acronym; // context id
|
||||
private String name; // context name
|
||||
private String type; // context type: research initiative or research community
|
||||
private String description;
|
||||
|
@ -57,12 +57,12 @@ public class ResearchInitiative implements Serializable {
|
|||
this.name = label;
|
||||
}
|
||||
|
||||
public String getOriginalId() {
|
||||
return originalId;
|
||||
public String getAcronym() {
|
||||
return acronym;
|
||||
}
|
||||
|
||||
public void setOriginalId(String originalId) {
|
||||
this.originalId = originalId;
|
||||
public void setAcronym(String acronym) {
|
||||
this.acronym = acronym;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
|
||||
import java.text.ParseException;
|
||||
|
@ -10,6 +8,8 @@ import java.util.*;
|
|||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
|
||||
/**
|
||||
* Relation models any edge between two nodes in the OpenAIRE graph. It has a source id and a target id pointing to
|
||||
* graph node identifiers and it is further characterised by the semantic of the link through the fields relType,
|
||||
|
@ -137,7 +137,10 @@ public class Relation extends Oaf {
|
|||
try {
|
||||
setValidationDate(ModelSupport.oldest(getValidationDate(), r.getValidationDate()));
|
||||
} catch (ParseException e) {
|
||||
throw new IllegalArgumentException(String.format("invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(), getValidationDate()));
|
||||
throw new IllegalArgumentException(String
|
||||
.format(
|
||||
"invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(),
|
||||
getValidationDate()));
|
||||
}
|
||||
|
||||
super.mergeFrom(r);
|
||||
|
|
|
@ -4,6 +4,11 @@ package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
|
|||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -22,11 +27,6 @@ import eu.dnetlib.dhp.schema.oaf.Datasource;
|
|||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
public class PrepareResultInstRepoAssociation {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareResultInstRepoAssociation.class);
|
||||
|
@ -56,9 +56,10 @@ public class PrepareResultInstRepoAssociation {
|
|||
final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
|
||||
log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
|
||||
|
||||
List<String> blacklist = Optional.ofNullable(parser.get("blacklist"))
|
||||
.map(v -> Arrays.asList(v.split(";")))
|
||||
.orElse(new ArrayList<>());
|
||||
List<String> blacklist = Optional
|
||||
.ofNullable(parser.get("blacklist"))
|
||||
.map(v -> Arrays.asList(v.split(";")))
|
||||
.orElse(new ArrayList<>());
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
|
||||
|
@ -91,14 +92,13 @@ public class PrepareResultInstRepoAssociation {
|
|||
private static void prepareDatasourceOrganization(
|
||||
SparkSession spark, String datasourceOrganizationPath, List<String> blacklist) {
|
||||
String blacklisted = "";
|
||||
if(blacklist.size() > 0 ){
|
||||
if (blacklist.size() > 0) {
|
||||
blacklisted = " AND d.id != '" + blacklist.get(0) + "'";
|
||||
for (int i = 1; i < blacklist.size(); i++) {
|
||||
blacklisted += " AND d.id != '" + blacklist.get(i) + "'";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
String query = "SELECT source datasourceId, target organizationId "
|
||||
+ "FROM ( SELECT id "
|
||||
+ "FROM datasource "
|
||||
|
|
|
@ -24,8 +24,6 @@ public class Constants {
|
|||
|
||||
public static String RESEARCH_INFRASTRUCTURE = "Research Infrastructure/Initiative";
|
||||
|
||||
public static String ORCID = "orcid";
|
||||
|
||||
static {
|
||||
accessRightsCoarMap.put("OPEN", "c_abf2");
|
||||
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
|
||||
|
|
|
@ -37,7 +37,8 @@ public class DumpProducts implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
execDump(spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, dumpType);
|
||||
execDump(
|
||||
spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, dumpType);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -89,7 +90,7 @@ public class DumpProducts implements Serializable {
|
|||
return c.getId();
|
||||
}
|
||||
if (c.getId().contains("::") && communities.contains(c.getId().substring(0, c.getId().indexOf("::")))) {
|
||||
return c.getId().substring(0, 3);
|
||||
return c.getId().substring(0, c.getId().indexOf("::"));
|
||||
}
|
||||
return null;
|
||||
}).filter(Objects::nonNull).collect(Collectors.toList());
|
||||
|
|
|
@ -20,7 +20,7 @@ public class QueryInformationSystem {
|
|||
private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
|
||||
+
|
||||
" where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " +
|
||||
" and ($x//context/param[./@name = 'status']/text() = 'manager' or $x//context/param[./@name = 'status']/text() = 'all') "
|
||||
" and ($x//context/param[./@name = 'status']/text() = 'all') "
|
||||
+
|
||||
" return " +
|
||||
"<community> " +
|
||||
|
|
|
@ -424,6 +424,19 @@ public class ResultMapper implements Serializable {
|
|||
.ifPresent(value -> instance.setType(value.getClassname()));
|
||||
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
|
||||
|
||||
Optional<Field<String>> oPca = Optional.ofNullable(i.getProcessingchargeamount());
|
||||
Optional<Field<String>> oPcc = Optional.ofNullable(i.getProcessingchargecurrency());
|
||||
if (oPca.isPresent() && oPcc.isPresent()) {
|
||||
Field<String> pca = oPca.get();
|
||||
Field<String> pcc = oPcc.get();
|
||||
if (!pca.getValue().trim().equals("") && !pcc.getValue().trim().equals("")) {
|
||||
APC apc = new APC();
|
||||
apc.setCurrency(oPcc.get().getValue());
|
||||
apc.setAmount(oPca.get().getValue());
|
||||
instance.setArticleprocessingcharge(apc);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private static List<Provenance> getUniqueProvenance(List<Provenance> provenance) {
|
||||
|
@ -503,7 +516,7 @@ public class ResultMapper implements Serializable {
|
|||
|
||||
private static Pid getOrcid(List<StructuredProperty> p) {
|
||||
for (StructuredProperty pid : p) {
|
||||
if (pid.getQualifier().getClassid().equals(Constants.ORCID)) {
|
||||
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
return Pid
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
|
@ -9,6 +10,7 @@ import org.apache.commons.logging.Log;
|
|||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.*;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.api.MissingConceptDoiException;
|
||||
|
@ -48,15 +50,12 @@ public class SendToZenodoHDFS implements Serializable {
|
|||
.orElse(false);
|
||||
|
||||
final String depositionId = Optional.ofNullable(parser.get("depositionId")).orElse(null);
|
||||
final String communityMapPath = parser.get("communityMapPath");
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
|
||||
CommunityMap communityMap = Utils.readCommunityMap(fileSystem, communityMapPath);
|
||||
|
||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||
.listFiles(
|
||||
new Path(hdfsPath), true);
|
||||
|
@ -87,11 +86,6 @@ public class SendToZenodoHDFS implements Serializable {
|
|||
if (!p_string.endsWith("_SUCCESS")) {
|
||||
// String tmp = p_string.substring(0, p_string.lastIndexOf("/"));
|
||||
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
log.info("Sending information for community: " + name);
|
||||
if (communityMap.containsKey(name.substring(0, name.lastIndexOf(".")))) {
|
||||
name = communityMap.get(name.substring(0, name.lastIndexOf("."))).replace(" ", "_") + ".tar";
|
||||
}
|
||||
|
||||
FSDataInputStream inputStream = fileSystem.open(p);
|
||||
zenodoApiClient.uploadIS(inputStream, name, fileStatus.getLen());
|
||||
|
||||
|
|
|
@ -34,12 +34,14 @@ public class CommunitySplit implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
execSplit(spark, inputPath, outputPath, Utils.getCommunityMap(spark, communityMapPath).keySet());
|
||||
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
|
||||
|
||||
execSplit(spark, inputPath, outputPath, communityMap);
|
||||
});
|
||||
}
|
||||
|
||||
private static void execSplit(SparkSession spark, String inputPath, String outputPath,
|
||||
Set<String> communities) {
|
||||
CommunityMap communities) {
|
||||
|
||||
Dataset<CommunityResult> result = Utils
|
||||
.readPath(spark, inputPath + "/publication", CommunityResult.class)
|
||||
|
@ -48,8 +50,9 @@ public class CommunitySplit implements Serializable {
|
|||
.union(Utils.readPath(spark, inputPath + "/software", CommunityResult.class));
|
||||
|
||||
communities
|
||||
.keySet()
|
||||
.stream()
|
||||
.forEach(c -> printResult(c, result, outputPath));
|
||||
.forEach(c -> printResult(c, result, outputPath + "/" + communities.get(c).replace(" ", "_")));
|
||||
|
||||
}
|
||||
|
||||
|
@ -57,16 +60,11 @@ public class CommunitySplit implements Serializable {
|
|||
Dataset<CommunityResult> community_products = result
|
||||
.filter((FilterFunction<CommunityResult>) r -> containsCommunity(r, c));
|
||||
|
||||
try {
|
||||
community_products.first();
|
||||
community_products
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath + "/" + c);
|
||||
} catch (Exception e) {
|
||||
|
||||
}
|
||||
community_products
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
|
||||
}
|
||||
|
||||
|
@ -75,9 +73,9 @@ public class CommunitySplit implements Serializable {
|
|||
return r
|
||||
.getContext()
|
||||
.stream()
|
||||
.filter(con -> con.getCode().equals(c))
|
||||
.map(con -> con.getCode())
|
||||
.collect(Collectors.toList())
|
||||
.size() > 0;
|
||||
.contains(c);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,85 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.community;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class RemoveCommunities implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(RemoveCommunities.class);
|
||||
private final static ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private final Configuration conf;
|
||||
private final BufferedWriter writer;
|
||||
private final CommunityMap communityMap;
|
||||
|
||||
public RemoveCommunities(String path, String hdfsNameNode) throws IOException {
|
||||
conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
Path hdfsPath = new Path(path);
|
||||
// FSDataInputStream p = fileSystem.open(hdfsPath);
|
||||
// ObjectMapper mapper = new ObjectMapper();
|
||||
communityMap = OBJECT_MAPPER.readValue((InputStream) fileSystem.open(hdfsPath), CommunityMap.class);
|
||||
FSDataOutputStream fsDataOutputStream = null;
|
||||
if (fileSystem.exists(hdfsPath)) {
|
||||
fileSystem.delete(hdfsPath);
|
||||
}
|
||||
fsDataOutputStream = fileSystem.create(hdfsPath);
|
||||
|
||||
writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
|
||||
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
RemoveCommunities.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/input_rc_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String nameNode = parser.get("nameNode");
|
||||
log.info("nameNode: {}", nameNode);
|
||||
|
||||
final String outputPath = parser.get("path");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final String communityId = parser.get("communityId");
|
||||
|
||||
final RemoveCommunities scm = new RemoveCommunities(outputPath, nameNode);
|
||||
|
||||
scm.removeCommunities(communityId);
|
||||
|
||||
}
|
||||
|
||||
private void removeCommunities(String communityId) throws IOException {
|
||||
Set<String> toRemove = communityMap.keySet().stream().map(key -> {
|
||||
if (key.equals(communityId))
|
||||
return null;
|
||||
return key;
|
||||
}).filter(Objects::nonNull).collect(Collectors.toSet());
|
||||
|
||||
toRemove.forEach(key -> communityMap.remove(key));
|
||||
writer.write(OBJECT_MAPPER.writeValueAsString(communityMap));
|
||||
writer.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -70,10 +70,10 @@ public class CreateContextRelation implements Serializable {
|
|||
cce.execute(Process::getRelation, CONTEX_RELATION_DATASOURCE, ModelSupport.getIdPrefix(Datasource.class));
|
||||
|
||||
log.info("Creating relations for projects... ");
|
||||
// cce
|
||||
// .execute(
|
||||
// Process::getRelation, CONTEX_RELATION_PROJECT,
|
||||
// ModelSupport.getIdPrefix(eu.dnetlib.dhp.schema.oaf.Project.class));
|
||||
cce
|
||||
.execute(
|
||||
Process::getRelation, CONTEX_RELATION_PROJECT,
|
||||
ModelSupport.getIdPrefix(eu.dnetlib.dhp.schema.oaf.Project.class));
|
||||
|
||||
cce.close();
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@ import java.util.*;
|
|||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
|
@ -453,6 +454,7 @@ public class DumpGraphEntities implements Serializable {
|
|||
.map(
|
||||
(MapFunction<E, Organization>) o -> mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) o),
|
||||
Encoders.bean(Organization.class))
|
||||
.filter((FilterFunction<Organization>) o -> o != null)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
|
@ -461,6 +463,8 @@ public class DumpGraphEntities implements Serializable {
|
|||
|
||||
private static Organization mapOrganization(eu.dnetlib.dhp.schema.oaf.Organization org) {
|
||||
Organization organization = new Organization();
|
||||
if (org.getDataInfo().getDeletedbyinference())
|
||||
return null;
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getLegalshortname())
|
||||
|
|
|
@ -147,7 +147,7 @@ public class Extractor implements Serializable {
|
|||
.map(
|
||||
paction -> Provenance
|
||||
.newInstance(
|
||||
paction.getClassid(),
|
||||
paction.getClassname(),
|
||||
dinfo.getTrust()))
|
||||
.orElse(
|
||||
Provenance
|
||||
|
|
|
@ -35,7 +35,7 @@ public class Process implements Serializable {
|
|||
ri.setType(Constants.RESEARCH_INFRASTRUCTURE);
|
||||
}
|
||||
ri.setId(Utils.getContextId(ci.getId()));
|
||||
ri.setOriginalId(ci.getId());
|
||||
ri.setAcronym(ci.getId());
|
||||
|
||||
ri.setDescription(ci.getDescription());
|
||||
ri.setName(ci.getName());
|
||||
|
|
|
@ -12,6 +12,7 @@ import org.dom4j.Node;
|
|||
import org.dom4j.io.SAXReader;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
|
@ -113,14 +114,72 @@ public class QueryInformationSystem {
|
|||
@NotNull
|
||||
private List<String> getCategoryList(Element el, String prefix) {
|
||||
List<String> datasourceList = new ArrayList<>();
|
||||
for (Object node : el.selectNodes(".//param")) {
|
||||
Node n = (Node) node;
|
||||
if (n.valueOf("./@name").equals("openaireId")) {
|
||||
datasourceList.add(prefix + "|" + n.getText());
|
||||
}
|
||||
for (Object node : el.selectNodes(".//concept")) {
|
||||
String oid = getOpenaireId((Node) node, prefix);
|
||||
if (oid != null)
|
||||
datasourceList.add(oid);
|
||||
}
|
||||
|
||||
return datasourceList;
|
||||
}
|
||||
|
||||
private String getOpenaireId(Node el, String prefix) {
|
||||
|
||||
for (Object node : el.selectNodes(".//param")) {
|
||||
Node n = (Node) node;
|
||||
if (n.valueOf("./@name").equals("openaireId")) {
|
||||
return prefix + "|" + n.getText();
|
||||
}
|
||||
}
|
||||
return makeOpenaireId(el, prefix);
|
||||
|
||||
}
|
||||
|
||||
private String makeOpenaireId(Node el, String prefix) {
|
||||
String funder = null;
|
||||
String grantId = null;
|
||||
String funding = null;
|
||||
for (Object node : el.selectNodes(".//param")) {
|
||||
Node n = (Node) node;
|
||||
switch (n.valueOf("./@name")) {
|
||||
case "funding":
|
||||
funding = n.getText();
|
||||
break;
|
||||
case "funder":
|
||||
funder = n.getText();
|
||||
break;
|
||||
case "CD_PROJECT_NUMBER":
|
||||
grantId = n.getText();
|
||||
break;
|
||||
}
|
||||
}
|
||||
String nsp = null;
|
||||
switch (funder.toLowerCase()) {
|
||||
case "ec":
|
||||
if (funding == null) {
|
||||
return null;
|
||||
}
|
||||
if (funding.toLowerCase().contains("h2020")) {
|
||||
nsp = "corda__h2020::";
|
||||
} else {
|
||||
nsp = "corda_______::";
|
||||
}
|
||||
break;
|
||||
case "tubitak":
|
||||
nsp = "tubitakf____::";
|
||||
break;
|
||||
case "dfg":
|
||||
nsp = "dfgf________::";
|
||||
break;
|
||||
default:
|
||||
nsp = funder.toLowerCase();
|
||||
for (int i = funder.length(); i < 12; i++)
|
||||
nsp += "_";
|
||||
nsp += "::";
|
||||
}
|
||||
|
||||
return prefix + "|" + nsp + DHPUtils.md5(grantId);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,137 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
/**
|
||||
* It selects the valid relations among those present in the graph. One relation is valid if it is not deletedbyinference
|
||||
* and if both the source and the target node are present in the graph and are not deleted by inference nor invisible.
|
||||
* To check this I made a view of the ids of all the entities in the graph, and select the relations for which a join exists
|
||||
* with this view for both the source and the target
|
||||
*/
|
||||
|
||||
public class SparkSelectValidRelationsJob implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkSelectValidRelationsJob.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkSelectValidRelationsJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/dump/complete/input_relationdump_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
selectValidRelation(spark, inputPath, outputPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void selectValidRelation(SparkSession spark, String inputPath, String outputPath) {
|
||||
Dataset<Relation> relation = Utils.readPath(spark, inputPath + "/relation", Relation.class);
|
||||
Dataset<Publication> publication = Utils.readPath(spark, inputPath + "/publication", Publication.class);
|
||||
Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> dataset = Utils
|
||||
.readPath(spark, inputPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
||||
Dataset<Software> software = Utils.readPath(spark, inputPath + "/software", Software.class);
|
||||
Dataset<OtherResearchProduct> other = Utils
|
||||
.readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class);
|
||||
Dataset<Organization> organization = Utils.readPath(spark, inputPath + "/organization", Organization.class);
|
||||
Dataset<Project> project = Utils.readPath(spark, inputPath + "/project", Project.class);
|
||||
Dataset<Datasource> datasource = Utils.readPath(spark, inputPath + "/datasource", Datasource.class);
|
||||
|
||||
relation.createOrReplaceTempView("relation");
|
||||
publication.createOrReplaceTempView("publication");
|
||||
dataset.createOrReplaceTempView("dataset");
|
||||
other.createOrReplaceTempView("other");
|
||||
software.createOrReplaceTempView("software");
|
||||
organization.createOrReplaceTempView("organization");
|
||||
project.createOrReplaceTempView("project");
|
||||
datasource.createOrReplaceTempView("datasource");
|
||||
|
||||
spark
|
||||
.sql(
|
||||
"SELECT id " +
|
||||
"FROM publication " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM dataset " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM other " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM software " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM organization " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM project " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false " +
|
||||
"UNION ALL " +
|
||||
"SELECT id " +
|
||||
"FROM datasource " +
|
||||
"WHERE datainfo.deletedbyinference = false AND datainfo.invisible = false ")
|
||||
.createOrReplaceTempView("identifiers");
|
||||
|
||||
spark
|
||||
.sql(
|
||||
"SELECT relation.* " +
|
||||
"FROM relation " +
|
||||
"JOIN identifiers i1 " +
|
||||
"ON source = i1.id " +
|
||||
"JOIN identifiers i2 " +
|
||||
"ON target = i2.id " +
|
||||
"WHERE datainfo.deletedbyinference = false")
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(outputPath);
|
||||
;
|
||||
|
||||
}
|
||||
}
|
|
@ -102,19 +102,26 @@ public class SparkDumpFunderResults implements Serializable {
|
|||
} else {
|
||||
funderdump = fundernsp.substring(0, fundernsp.indexOf("_")).toUpperCase();
|
||||
}
|
||||
writeFunderResult(funder, result, outputPath + "/" + funderdump);
|
||||
writeFunderResult(funder, result, outputPath, funderdump);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void writeFunderResult(String funder, Dataset<CommunityResult> results, String outputPath) {
|
||||
private static void dumpResults(String nsp, Dataset<CommunityResult> results, String outputPath,
|
||||
String funderName) {
|
||||
|
||||
results.map((MapFunction<CommunityResult, CommunityResult>) r -> {
|
||||
if (!Optional.ofNullable(r.getProjects()).isPresent()) {
|
||||
return null;
|
||||
}
|
||||
for (Project p : r.getProjects()) {
|
||||
if (p.getId().startsWith(funder)) {
|
||||
if (p.getId().startsWith(nsp)) {
|
||||
if (nsp.startsWith("40|irb")) {
|
||||
if (p.getFunder().getShortName().equals(funderName))
|
||||
return r;
|
||||
else
|
||||
return null;
|
||||
}
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
@ -124,7 +131,18 @@ public class SparkDumpFunderResults implements Serializable {
|
|||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
.json(outputPath + "/" + funderName);
|
||||
}
|
||||
|
||||
private static void writeFunderResult(String funder, Dataset<CommunityResult> results, String outputPath,
|
||||
String funderDump) {
|
||||
|
||||
if (funder.startsWith("40|irb")) {
|
||||
dumpResults(funder, results, outputPath, "HRZZ");
|
||||
dumpResults(funder, results, outputPath, "MZOS");
|
||||
} else
|
||||
dumpResults(funder, results, outputPath, funderDump);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,18 @@
|
|||
<workflow-app name="dump_community_products" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>singleDeposition</name>
|
||||
<description>Indicates if each file in the directory should be uploaded in a own deposition</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>upload</name>
|
||||
<description>true if the dump should be upload in zenodo</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>communityId</name>
|
||||
<description>the id of the community to be dumped if a dump for a single community should be done</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
|
@ -123,6 +135,24 @@
|
|||
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||
</java>
|
||||
<ok to="single_deposition"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="single_deposition">
|
||||
<switch>
|
||||
<case to="remove_communities">${wf:conf('singleDeposition') eq true}</case>
|
||||
<default to="fork_dump"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="remove_communities">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.dump.community.RemoveCommunities</main-class>
|
||||
<arg>--path</arg><arg>${workingDir}/communityMap</arg>
|
||||
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--communityId</arg><arg>${communityId}</arg>
|
||||
</java>
|
||||
<ok to="fork_dump"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
@ -405,10 +435,16 @@
|
|||
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/split</arg>
|
||||
</java>
|
||||
<ok to="send_zenodo"/>
|
||||
<ok to="should_upload"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="should_upload">
|
||||
<switch>
|
||||
<case to="send_zenodo">${wf:conf('upload') eq true}</case>
|
||||
<default to="End"/>
|
||||
</switch>
|
||||
</decision>
|
||||
<action name="send_zenodo">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
|
||||
|
@ -417,7 +453,6 @@
|
|||
<arg>--accessToken</arg><arg>${accessToken}</arg>
|
||||
<arg>--connectionUrl</arg><arg>${connectionUrl}</arg>
|
||||
<arg>--metadata</arg><arg>${metadata}</arg>
|
||||
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
|
||||
<arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg>
|
||||
<arg>--depositionId</arg><arg>${depositionId}</arg>
|
||||
<arg>--depositionType</arg><arg>${depositionType}</arg>
|
||||
|
|
|
@ -148,7 +148,7 @@
|
|||
<path start="dump_organization"/>
|
||||
<path start="dump_project"/>
|
||||
<path start="dump_datasource"/>
|
||||
<path start="dump_relation"/>
|
||||
<path start="select_relation"/>
|
||||
</fork>
|
||||
|
||||
<action name="dump_publication">
|
||||
|
@ -333,6 +333,30 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="select_relation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Select valid table relation </name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkSelectValidRelationsJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/validrelation</arg>
|
||||
</spark>
|
||||
<ok to="dump_relation"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="dump_relation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
@ -350,7 +374,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingDir}/validrelation</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/relation/relation</arg>
|
||||
</spark>
|
||||
<ok to="join_dump"/>
|
||||
|
@ -572,7 +596,6 @@
|
|||
<arg>--accessToken</arg><arg>${accessToken}</arg>
|
||||
<arg>--connectionUrl</arg><arg>${connectionUrl}</arg>
|
||||
<arg>--metadata</arg><arg>${metadata}</arg>
|
||||
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
|
||||
<arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg>
|
||||
<arg>--depositionType</arg><arg>${depositionType}</arg>
|
||||
<arg>--depositionId</arg><arg>${depositionId}</arg>
|
||||
|
|
|
@ -1,6 +1,23 @@
|
|||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"definitions": {
|
||||
"AccessRight":{
|
||||
"type":"object",
|
||||
"properties":{
|
||||
"code": {
|
||||
"type": "string",
|
||||
"description": "COAR access mode code: http://vocabularies.coar-repositories.org/documentation/access_rights/"
|
||||
},
|
||||
"label": {
|
||||
"type": "string",
|
||||
"description": "Label for the access mode"
|
||||
},
|
||||
"scheme": {
|
||||
"type": "string",
|
||||
"description": "Scheme of reference for access right code. Always set to COAR access rights vocabulary: http://vocabularies.coar-repositories.org/documentation/access_rights/"
|
||||
}
|
||||
}
|
||||
},
|
||||
"ControlledField": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -266,6 +283,57 @@
|
|||
]
|
||||
}
|
||||
},
|
||||
"instance":{
|
||||
"type":"array",
|
||||
"items":{
|
||||
"type":"object",
|
||||
"properties":{
|
||||
"accessright":{
|
||||
"allOf":[
|
||||
{
|
||||
"$ref":"#/definitions/AccessRight"
|
||||
},
|
||||
{
|
||||
"description":"The accessright of this materialization of the result"
|
||||
}
|
||||
]
|
||||
},
|
||||
"articleprocessingcharge":{
|
||||
"type":"object",
|
||||
"properties":{
|
||||
"amount":{
|
||||
"type":"string"
|
||||
},
|
||||
"currency":{
|
||||
"type":"string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"license":{
|
||||
"type":"string"
|
||||
},
|
||||
"publicationdate":{
|
||||
"type":"string"
|
||||
},
|
||||
"refereed":{
|
||||
"type":"string"
|
||||
},
|
||||
"type":{
|
||||
"type":"string",
|
||||
"description":"The specific sub-type of this materialization of the result (see https://api.openaire.eu/vocabularies/dnet:result_typologies following the links)"
|
||||
},
|
||||
"url":{
|
||||
"description":"Description of url",
|
||||
"type":"array",
|
||||
"items":{
|
||||
"type":"string",
|
||||
"description":"urls where it is possible to access the materialization of the result"
|
||||
}
|
||||
}
|
||||
},
|
||||
"description":"One of the materialization for this result"
|
||||
}
|
||||
},
|
||||
"programmingLanguage": {
|
||||
"type": "string",
|
||||
"description": "Only for results with type 'software': the programming language"
|
||||
|
@ -302,7 +370,7 @@
|
|||
"subject": {
|
||||
"allOf": [
|
||||
{"$ref": "#/definitions/ControlledField"},
|
||||
{"description": "OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies) and value. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary)."},
|
||||
{"description": "OpenAIRE subject classification scheme (https://api.openaire.eu/vocabularies/dnet:subject_classification_typologies) and value. When the scheme is 'keyword', it means that the subject is free-text (i.e. not a term from a controlled vocabulary)."}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
<workflow-app name="dump_funder_results" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>upload</name>
|
||||
<value>false</value>
|
||||
<value>true</value>
|
||||
<description>true to upload the dump for the funders in Zenodo</description>
|
||||
</property>
|
||||
<property>
|
||||
|
@ -549,7 +548,6 @@
|
|||
<arg>--accessToken</arg><arg>${accessToken}</arg>
|
||||
<arg>--connectionUrl</arg><arg>${connectionUrl}</arg>
|
||||
<arg>--metadata</arg><arg>${metadata}</arg>
|
||||
<arg>--communityMapPath</arg><arg>${workingDir}/communityMap</arg>
|
||||
<arg>--conceptRecordId</arg><arg>${conceptRecordId}</arg>
|
||||
<arg>--depositionType</arg><arg>${depositionType}</arg>
|
||||
<arg>--depositionId</arg><arg>${depositionId}</arg>
|
||||
|
|
|
@ -35,7 +35,12 @@
|
|||
"paramLongName":"dumpType",
|
||||
"paramDescription": "the type of the dump (complete for the whole graph, community for the products related to communities, funder for the results with at least a link to project",
|
||||
"paramRequired": false
|
||||
}
|
||||
}, {
|
||||
"paramName":"cid",
|
||||
"paramLongName":"communityId",
|
||||
"paramDescription": "the id of the community to be dumped",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
[
|
||||
|
||||
{
|
||||
"paramName":"ci",
|
||||
"paramLongName":"communityId",
|
||||
"paramDescription": "URL of the isLookUp Service",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName":"nn",
|
||||
"paramLongName":"nameNode",
|
||||
"paramDescription": "the name node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "p",
|
||||
"paramLongName": "path",
|
||||
"paramDescription": "the path used to store temporary output files",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
|
|
@ -25,6 +25,12 @@
|
|||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "true if the spark session is managed, false otherwise",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "cid",
|
||||
"paramLongName": "communityId",
|
||||
"paramDescription": "true if the spark session is managed, false otherwise",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
||||
|
||||
|
|
|
@ -12,12 +12,6 @@
|
|||
"paramDescription": "The id of the concept record for a new version",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName":"cmp",
|
||||
"paramLongName":"communityMapPath",
|
||||
"paramDescription": "the path to the serialization of the community map",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName":"di",
|
||||
"paramLongName":"depositionId",
|
||||
|
|
|
@ -13,6 +13,7 @@ import org.apache.spark.api.java.JavaRDD;
|
|||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.ForeachFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -408,4 +409,53 @@ public class DumpJobTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testArticlePCA() {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/resultDump/publication_pca")
|
||||
.getPath();
|
||||
|
||||
final String communityMapPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath();
|
||||
|
||||
DumpProducts dump = new DumpProducts();
|
||||
dump
|
||||
.run(
|
||||
// false, sourcePath, workingDir.toString() + "/result", communityMapPath, Publication.class,
|
||||
false, sourcePath, workingDir.toString() + "/result", communityMapPath, Publication.class,
|
||||
GraphResult.class, Constants.DUMPTYPE.COMPLETE.getType());
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<GraphResult> tmp = sc
|
||||
.textFile(workingDir.toString() + "/result")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, GraphResult.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<GraphResult> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(GraphResult.class));
|
||||
|
||||
Assertions.assertEquals(23, verificationDataset.count());
|
||||
// verificationDataset.show(false);
|
||||
|
||||
Assertions.assertEquals(23, verificationDataset.filter("type = 'publication'").count());
|
||||
|
||||
verificationDataset.createOrReplaceTempView("check");
|
||||
|
||||
org.apache.spark.sql.Dataset<Row> temp = spark
|
||||
.sql(
|
||||
"select id " +
|
||||
"from check " +
|
||||
"lateral view explode (instance) i as inst " +
|
||||
"where inst.articleprocessingcharge is not null");
|
||||
|
||||
Assertions.assertTrue(temp.count() == 2);
|
||||
|
||||
Assertions.assertTrue(temp.filter("id = '50|datacite____::05c611fdfc93d7a2a703d1324e28104a'").count() == 1);
|
||||
|
||||
Assertions.assertTrue(temp.filter("id = '50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8'").count() == 1);
|
||||
|
||||
//TODO verify value and name of the fields for vocab related value (i.e. accessright, bestaccessright)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ import com.github.victools.jsonschema.generator.*;
|
|||
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
|
||||
|
||||
@Disabled
|
||||
//@Disabled
|
||||
public class GenerateJsonSchema {
|
||||
|
||||
@Test
|
||||
|
@ -21,7 +21,7 @@ public class GenerateJsonSchema {
|
|||
configBuilder.forFields().withDescriptionResolver(field -> "Description of " + field.getDeclaredName());
|
||||
SchemaGeneratorConfig config = configBuilder.build();
|
||||
SchemaGenerator generator = new SchemaGenerator(config);
|
||||
JsonNode jsonSchema = generator.generateSchema(Relation.class);
|
||||
JsonNode jsonSchema = generator.generateSchema(GraphResult.class);
|
||||
|
||||
System.out.println(jsonSchema.toString());
|
||||
}
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.community;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocalFileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.MakeTar;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.MakeTarTest;
|
||||
|
||||
public class RemoveCommunityTest {
|
||||
|
||||
private static String workingDir;
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(MakeTarTest.class.getSimpleName())
|
||||
.toString();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemove() throws Exception {
|
||||
LocalFileSystem fs = FileSystem.getLocal(new Configuration());
|
||||
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
|
||||
.getPath()),
|
||||
new Path(workingDir + "/communityMap"));
|
||||
|
||||
String path = workingDir + "/communityMap";
|
||||
|
||||
RemoveCommunities.main(new String[] {
|
||||
"-nameNode", "local",
|
||||
"-path", path,
|
||||
"-communityId", "beopen"
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
CommunityMap cm = new ObjectMapper()
|
||||
.readValue(new FileInputStream(workingDir + "/communityMap"), CommunityMap.class);
|
||||
|
||||
Assertions.assertEquals(1, cm.size());
|
||||
Assertions.assertTrue(cm.containsKey("beopen"));
|
||||
|
||||
}
|
||||
}
|
|
@ -97,7 +97,7 @@ public class CreateEntityTest {
|
|||
Assertions.assertEquals(12, riList.size());
|
||||
|
||||
riList.stream().forEach(c -> {
|
||||
switch (c.getOriginalId()) {
|
||||
switch (c.getAcronym()) {
|
||||
case "mes":
|
||||
Assertions
|
||||
.assertTrue(c.getType().equals(eu.dnetlib.dhp.oa.graph.dump.Constants.RESEARCH_COMMUNITY));
|
||||
|
@ -115,9 +115,9 @@ public class CreateEntityTest {
|
|||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID, Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5(c.getOriginalId()))));
|
||||
DHPUtils.md5(c.getAcronym()))));
|
||||
Assertions.assertTrue(c.getZenodo_community().equals("https://zenodo.org/communities/oac_mes"));
|
||||
Assertions.assertTrue("mes".equals(c.getOriginalId()));
|
||||
Assertions.assertTrue("mes".equals(c.getAcronym()));
|
||||
break;
|
||||
case "clarin":
|
||||
Assertions
|
||||
|
@ -130,9 +130,9 @@ public class CreateEntityTest {
|
|||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID, Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5(c.getOriginalId()))));
|
||||
DHPUtils.md5(c.getAcronym()))));
|
||||
Assertions.assertTrue(c.getZenodo_community().equals("https://zenodo.org/communities/oac_clarin"));
|
||||
Assertions.assertTrue("clarin".equals(c.getOriginalId()));
|
||||
Assertions.assertTrue("clarin".equals(c.getAcronym()));
|
||||
break;
|
||||
}
|
||||
// TODO add check for all the others Entities
|
||||
|
|
|
@ -9,11 +9,14 @@ import org.junit.jupiter.api.Assertions;
|
|||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class CreateRelationTest {
|
||||
|
@ -203,6 +206,7 @@ public class CreateRelationTest {
|
|||
" <param name=\"suggestedAcknowledgement\"/>\n" +
|
||||
" <param name=\"zenodoCommunity\">oac_ni</param>\n" +
|
||||
" <param name=\"creationdate\">2018-03-01T12:00:00</param>\n" +
|
||||
" <category claim=\"false\" id=\"ni::projects\" label=\"NI Content providers\"/>\n" +
|
||||
" <category claim=\"false\" id=\"ni::contentproviders\" label=\"NI Content providers\">\n" +
|
||||
" <concept claim=\"false\" id=\"ni::contentproviders::1\" label=\"OpenNeuro\">\n" +
|
||||
" <param name=\"openaireId\">re3data_____::5b9bf9171d92df854cf3c520692e9122</param>\n" +
|
||||
|
@ -376,7 +380,7 @@ public class CreateRelationTest {
|
|||
" <param name=\"rule\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">675858</param>\n" +
|
||||
" <param name=\"url\"/>\n" +
|
||||
" <param name=\"funding\">H2020-EINFRA-2015-1</param>\n" +
|
||||
" <param name=\"funding\">EC | H2020 | RIA</param>\n" +
|
||||
" <param name=\"funder\">EC</param>\n" +
|
||||
" <param name=\"acronym\">West-Life</param>\n" +
|
||||
" </concept>\n" +
|
||||
|
@ -437,7 +441,65 @@ public class CreateRelationTest {
|
|||
" <param name=\"suggestedAcknowledgement\"/>\n" +
|
||||
" <param name=\"zenodoCommunity\">oaa_elixir-gr</param>\n" +
|
||||
" <param name=\"creationdate\">2018-03-01T12:00:00</param>\n" +
|
||||
" <category claim=\"false\" id=\"elixir-gr::projects\" label=\"ELIXIR GR Projects\"/>\n" +
|
||||
" <category claim=\"false\" id=\"elixir-gr::projects\" label=\"ELIXIR GR Projects\">\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::12\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">BIO-INFORMATICS RESEARCH NETWORK COORDINATING CENTER (BIRN-CC)</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">1U24RR025736-01</param>\n" +
|
||||
" <param name=\"funder\">NIH</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::13\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">COLLABORATIVE RESEARCH: The Cognitive Neuroscience of Category Learning</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">0223843</param>\n" +
|
||||
" <param name=\"funder\">NSF</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::14\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">The Cognitive Atlas: Developing an Interdisciplinary Knowledge Base Through Socia</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">5R01MH082795-05</param>\n" +
|
||||
" <param name=\"funder\">NIH</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::15\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">Fragmented early life environmental and emotional / cognitive vulnerabilities</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">1P50MH096889-01A1</param>\n" +
|
||||
" <param name=\"funder\">NIH</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::16\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">Enhancement of the 1000 Functional Connectome Project</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">1R03MH096321-01A1</param>\n" +
|
||||
" <param name=\"funder\">TUBITAK</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::17\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">CRCNS Data Sharing: An open data repository for cognitive neuroscience: The OpenfMRI Project</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">1131441</param>\n" +
|
||||
" <param name=\"funder\">NSF</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::18\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">Enhancing Human Cortical Plasticity: Visual Psychophysics and fMRI</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">0121950</param>\n" +
|
||||
" <param name=\"funder\">NSF</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::18\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">Transforming statistical methodology for neuroimaging meta-analysis.</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">100309</param>\n" +
|
||||
" <param name=\"funder\">WT</param>\n" +
|
||||
" </concept>\n" +
|
||||
" </category>" +
|
||||
|
||||
" <category claim=\"false\" id=\"elixir-gr::contentproviders\" label=\"Elixir-GR Content providers\">\n"
|
||||
+
|
||||
" <concept claim=\"false\" id=\"elixir-gr::contentproviders::1\" label=\"bio.tools\">\n" +
|
||||
|
@ -566,4 +628,98 @@ public class CreateRelationTest {
|
|||
tmp.contains("10|doajarticles::2899208a99aa7d142646e0a80bfeef05"));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test2() {
|
||||
List<ContextInfo> cInfoList = new ArrayList<>();
|
||||
final Consumer<ContextInfo> consumer = ci -> cInfoList.add(ci);
|
||||
|
||||
queryInformationSystem
|
||||
.getContextRelation(consumer, "projects", ModelSupport.getIdPrefix(Project.class));
|
||||
|
||||
cInfoList.forEach(c -> System.out.println(new Gson().toJson(c)));
|
||||
|
||||
List<Relation> rList = new ArrayList<>();
|
||||
|
||||
cInfoList.forEach(cInfo -> Process.getRelation(cInfo).forEach(rList::add));
|
||||
|
||||
Assertions.assertEquals(44, rList.size());
|
||||
|
||||
Assertions
|
||||
.assertFalse(
|
||||
rList
|
||||
.stream()
|
||||
.map(r -> r.getSource().getId())
|
||||
.collect(Collectors.toSet())
|
||||
.contains(
|
||||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID,
|
||||
Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5("dh-ch"))));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
rList
|
||||
.stream()
|
||||
.filter(
|
||||
r -> r
|
||||
.getSource()
|
||||
.getId()
|
||||
.equals(
|
||||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID,
|
||||
Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5("clarin"))))
|
||||
.collect(Collectors.toList())
|
||||
.size());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
rList
|
||||
.stream()
|
||||
.filter(
|
||||
r -> r
|
||||
.getTarget()
|
||||
.getId()
|
||||
.equals(
|
||||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID,
|
||||
Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5("clarin"))))
|
||||
.collect(Collectors.toList())
|
||||
.size());
|
||||
|
||||
Set<String> tmp = rList
|
||||
.stream()
|
||||
.filter(
|
||||
r -> r
|
||||
.getSource()
|
||||
.getId()
|
||||
.equals(
|
||||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID,
|
||||
Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5("clarin"))))
|
||||
.map(r -> r.getTarget().getId())
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
Assertions
|
||||
.assertTrue(
|
||||
tmp.contains("40|corda__h2020::b5a4eb56bf84bef2ebc193306b4d423f") &&
|
||||
tmp.contains("40|corda_______::ef782b2d85676aa3e5a907427feb18c4"));
|
||||
|
||||
rList.forEach(rel -> {
|
||||
if (rel.getSource().getId().startsWith("40|")) {
|
||||
String proj = rel.getSource().getId().substring(3);
|
||||
Assertions.assertTrue(proj.substring(0, proj.indexOf("::")).length() == 12);
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -88,7 +88,7 @@ public class DumpOrganizationProjectDatasourceTest {
|
|||
org.apache.spark.sql.Dataset<eu.dnetlib.dhp.schema.dump.oaf.graph.Organization> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Organization.class));
|
||||
|
||||
Assertions.assertEquals(34, verificationDataset.count());
|
||||
Assertions.assertEquals(15, verificationDataset.count());
|
||||
|
||||
verificationDataset
|
||||
.foreach(
|
||||
|
|
|
@ -0,0 +1,100 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.dump.complete;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.ForeachFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import net.sf.saxon.expr.instruct.ForEach;
|
||||
|
||||
public class SelectRelationTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(SelectRelationTest.class);
|
||||
|
||||
private static HashMap<String, String> map = new HashMap<>();
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(SelectRelationTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(SelectRelationTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(SelectRelationTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1() throws Exception {
|
||||
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/dump/selectrelations")
|
||||
.getPath();
|
||||
|
||||
SparkSelectValidRelationsJob.main(new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-outputPath", workingDir.toString() + "/relation",
|
||||
"-sourcePath", sourcePath
|
||||
});
|
||||
|
||||
// dumpCommunityProducts.exec(MOCK_IS_LOOK_UP_URL,Boolean.FALSE, workingDir.toString()+"/dataset",sourcePath,"eu.dnetlib.dhp.schema.oaf.Dataset","eu.dnetlib.dhp.schema.dump.oaf.Dataset");
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<eu.dnetlib.dhp.schema.oaf.Relation> tmp = sc
|
||||
.textFile(workingDir.toString() + "/relation")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, eu.dnetlib.dhp.schema.oaf.Relation.class));
|
||||
|
||||
org.apache.spark.sql.Dataset<Relation> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(eu.dnetlib.dhp.schema.oaf.Relation.class));
|
||||
|
||||
Assertions.assertTrue(verificationDataset.count() == 7);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -137,5 +137,10 @@ public class SplitPerFunderTest {
|
|||
.map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
Assertions.assertEquals(3, tmp.count());
|
||||
|
||||
// H2020 3
|
||||
tmp = sc
|
||||
.textFile(workingDir.toString() + "/split/MZOS")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, CommunityResult.class));
|
||||
Assertions.assertEquals(1, tmp.count());
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -5,4 +5,5 @@
|
|||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1603715146539,"properties":[],"relClass":"isProducedBy","relType":"datasourceOrganization","source":"10|doajarticles::8b75543067b50076e70764917e188178","subRelType":"provision","target":"40|snsf________::50cb15ff7a6a3f8531f063770179e346"}
|
||||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1603715146539,"properties":[],"relClass":"isProducedBy","relType":"datasourceOrganization","source":"10|doajarticles::9f3ff882f023209d9ffb4dc32b77d376","subRelType":"provision","target":"40|corda_______::ffc1811633b3222e4764c7b0517f83e8"}
|
||||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1603715146539,"properties":[],"relClass":"isProducedBy","relType":"datasourceOrganization","source":"10|doajarticles::b566fa319c3923454e1e8eb886ab62d2","subRelType":"provision","target":"40|nhmrc_______::4e6c928fef9851b37ec73f4f6daca35b"}
|
||||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1603715146539,"properties":[],"relClass":"isProducedBy","relType":"datasourceOrganization","source":"10|doajarticles::e0554fb004a155bc23cfb43ee9fc8eae","subRelType":"provision","target":"40|corda__h2020::846b777af165fef7c904a81712a83b66"}
|
||||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1603715146539,"properties":[],"relClass":"isProducedBy","relType":"datasourceOrganization","source":"10|doajarticles::e0554fb004a155bc23cfb43ee9fc8eae","subRelType":"provision","target":"40|corda__h2020::846b777af165fef7c904a81712a83b66"}
|
||||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1603715146539,"properties":[],"relClass":"isProducedBy","relType":"datasourceOrganization","source":"10|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"40|irb_hr______::1e5e62235d094afd01cd56e65112fc63"}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,15 @@
|
|||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::2baa9032dc058d3c8ff780c426b0c19f","subRelType":"provision","target":"20|dedup_wf_001::2899e571609779168222fdeb59cb916d"}
|
||||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::5ac587eb28411c351c2e357eb097fd3d","subRelType":"provision","target":"20|grid________::b91f67a34df55a0aa1aabdcb3700f413"}
|
||||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::690b3aaf177a4c70b81bacd8d023cbdc","subRelType":"provision","target":"20|doajarticles::396262ee936f3d3e26ff0e60bea6cae0"}
|
||||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":true,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"50|doajarticles::b566fa319c3923454e1e8eb886ab62d2","subRelType":"provision","target":"20|dedup_wf_001::4e6c928fef9851b37ec73f4f6daca35b"}
|
||||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"50|doajarticles::e0554fb004a155bc23cfb43ee9fc8eae","subRelType":"provision","target":"20|grid________::b91f67a34df55a0aa1aabdcb3700f413"}
|
||||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"50|doajarticles::fbf7592ddbf2ad3cc0ed70c0f2e1d67c","subRelType":"provision","target":"20|dedup_wf_001::1b965e2c0c53e5526d269d63bcfa0ae6"}
|
||||
{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"50|doajarticles::fd4c399077127f0ba09b5205e2b78406","subRelType":"provision","target":"50|doajarticles::1cae0b82b56ccd97c2db1f698def7074"}
|
||||
{"collectedfrom":[{"key":"10|infrastruct_::f66f1bd369679b5b077dcdf006089556","value":"OpenAIRE"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1594398578323,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"50|openaire____::8f991165fae922e29ad55d592f568464","subRelType":"provision","target":"50|openaire____::ec653e804967133b9436fdd30d3ff51d"}
|
||||
{"collectedfrom":[{"key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb","value":"OpenDOAR"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"50|opendoar____::15231a7ce4ba789d13b722cc5c955834","subRelType":"provision","target":"50|dedup_wf_001::1ea4bcb1bae8c6befef1e7f1230f0f10"}
|
||||
{"collectedfrom":[{"key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb","value":"OpenDOAR"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"50|opendoar____::16d11e9595188dbad0418a85f0351aba","subRelType":"provision","target":"40|opendoar____::041abd8c990fc531ab9bd2674a0e2725"}
|
||||
{"collectedfrom":[{"key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb","value":"OpenDOAR"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"50|opendoar____::46d3f6029f6170ebccb28945964d09bf","subRelType":"provision","target":"40|opendoar____::a5fcb8eb25ebd6f7cd219e0fa1e6ddc1"}
|
||||
{"collectedfrom":[{"key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb","value":"OpenDOAR"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"50|opendoar____::7501e5d4da87ac39d782741cd794002d","subRelType":"provision","target":"40|dedup_wf_001::04e2c34ef4daa411ff2497afc807b612"}
|
||||
{"collectedfrom":[{"key":"10|openaire____::6ac933301a3933c8a22ceebea7000326","value":"Academy of Finland"}],"dataInfo":{"deletedbyinference":true,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900000000000000022"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isParticipant","relType":"projectOrganization","source":"20|aka_________::0cd5965141113df5739f1ac7ac7f6d37","subRelType":"participation","target":"40|aka_________::1bc716a1763110da3eb1af867de718a8"}
|
||||
{"collectedfrom":[{"key":"10|openaire____::6ac933301a3933c8a22ceebea7000326","value":"Academy of Finland"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900000000000000022"},"lastupdatetimestamp":1592688952862,"properties":[],"relClass":"isParticipant","relType":"projectOrganization","source":"20|aka_________::1e2df822bf0932ad0f77565789f22e17","subRelType":"participation","target":"40|aka_________::a6c805bcfd383bae043d8df38e79db78"}
|
||||
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"iis::document_affiliations","inferred":true,"invisible":false,"provenanceaction":{"classid":"iis","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.8966"},"lastupdatetimestamp":1595258695262,"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|aka_________::2c3aab6bce7516338b4dbfb4f6f86db7","subRelType":"affiliation","target":"40|dedup_wf_001::02859c30f6c8bfbdd8c427068a6ec684"}
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue