forked from D-Net/dnet-hadoop
Compare commits
10 Commits
f95ec49a59
...
04a0d1ba6e
Author | SHA1 | Date |
---|---|---|
Miriam Baglioni | 04a0d1ba6e | |
Miriam Baglioni | 6b51b69cf7 | |
Miriam Baglioni | bd4b6b053d | |
Miriam Baglioni | 26b34201ec | |
Miriam Baglioni | 3d94c12d6e | |
Miriam Baglioni | 95c5f97259 | |
Miriam Baglioni | eaf86828e6 | |
Miriam Baglioni | c58206c3ba | |
Miriam Baglioni | 3e3a45d930 | |
Miriam Baglioni | 46a322b770 |
|
@ -19,7 +19,7 @@ import java.io.Serializable;
|
|||
*/
|
||||
public class ResearchInitiative implements Serializable {
|
||||
private String id; // openaireId
|
||||
private String originalId; // context id
|
||||
private String acronym; // context id
|
||||
private String name; // context name
|
||||
private String type; // context type: research initiative or research community
|
||||
private String description;
|
||||
|
@ -57,12 +57,12 @@ public class ResearchInitiative implements Serializable {
|
|||
this.name = label;
|
||||
}
|
||||
|
||||
public String getOriginalId() {
|
||||
return originalId;
|
||||
public String getAcronym() {
|
||||
return acronym;
|
||||
}
|
||||
|
||||
public void setOriginalId(String originalId) {
|
||||
this.originalId = originalId;
|
||||
public void setAcronym(String acronym) {
|
||||
this.acronym = acronym;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
|
||||
import java.text.ParseException;
|
||||
|
@ -10,6 +8,8 @@ import java.util.*;
|
|||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
|
||||
/**
|
||||
* Relation models any edge between two nodes in the OpenAIRE graph. It has a source id and a target id pointing to
|
||||
* graph node identifiers and it is further characterised by the semantic of the link through the fields relType,
|
||||
|
@ -137,7 +137,10 @@ public class Relation extends Oaf {
|
|||
try {
|
||||
setValidationDate(ModelSupport.oldest(getValidationDate(), r.getValidationDate()));
|
||||
} catch (ParseException e) {
|
||||
throw new IllegalArgumentException(String.format("invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(), getValidationDate()));
|
||||
throw new IllegalArgumentException(String
|
||||
.format(
|
||||
"invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(),
|
||||
getValidationDate()));
|
||||
}
|
||||
|
||||
super.mergeFrom(r);
|
||||
|
|
|
@ -4,6 +4,11 @@ package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
|
|||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -22,11 +27,6 @@ import eu.dnetlib.dhp.schema.oaf.Datasource;
|
|||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
public class PrepareResultInstRepoAssociation {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareResultInstRepoAssociation.class);
|
||||
|
@ -56,9 +56,10 @@ public class PrepareResultInstRepoAssociation {
|
|||
final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
|
||||
log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
|
||||
|
||||
List<String> blacklist = Optional.ofNullable(parser.get("blacklist"))
|
||||
.map(v -> Arrays.asList(v.split(";")))
|
||||
.orElse(new ArrayList<>());
|
||||
List<String> blacklist = Optional
|
||||
.ofNullable(parser.get("blacklist"))
|
||||
.map(v -> Arrays.asList(v.split(";")))
|
||||
.orElse(new ArrayList<>());
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
|
||||
|
@ -91,14 +92,13 @@ public class PrepareResultInstRepoAssociation {
|
|||
private static void prepareDatasourceOrganization(
|
||||
SparkSession spark, String datasourceOrganizationPath, List<String> blacklist) {
|
||||
String blacklisted = "";
|
||||
if(blacklist.size() > 0 ){
|
||||
if (blacklist.size() > 0) {
|
||||
blacklisted = " AND d.id != '" + blacklist.get(0) + "'";
|
||||
for (int i = 1; i < blacklist.size(); i++) {
|
||||
blacklisted += " AND d.id != '" + blacklist.get(i) + "'";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
String query = "SELECT source datasourceId, target organizationId "
|
||||
+ "FROM ( SELECT id "
|
||||
+ "FROM datasource "
|
||||
|
|
|
@ -70,10 +70,10 @@ public class CreateContextRelation implements Serializable {
|
|||
cce.execute(Process::getRelation, CONTEX_RELATION_DATASOURCE, ModelSupport.getIdPrefix(Datasource.class));
|
||||
|
||||
log.info("Creating relations for projects... ");
|
||||
// cce
|
||||
// .execute(
|
||||
// Process::getRelation, CONTEX_RELATION_PROJECT,
|
||||
// ModelSupport.getIdPrefix(eu.dnetlib.dhp.schema.oaf.Project.class));
|
||||
cce
|
||||
.execute(
|
||||
Process::getRelation, CONTEX_RELATION_PROJECT,
|
||||
ModelSupport.getIdPrefix(eu.dnetlib.dhp.schema.oaf.Project.class));
|
||||
|
||||
cce.close();
|
||||
|
||||
|
|
|
@ -147,7 +147,7 @@ public class Extractor implements Serializable {
|
|||
.map(
|
||||
paction -> Provenance
|
||||
.newInstance(
|
||||
paction.getClassid(),
|
||||
paction.getClassname(),
|
||||
dinfo.getTrust()))
|
||||
.orElse(
|
||||
Provenance
|
||||
|
|
|
@ -35,7 +35,7 @@ public class Process implements Serializable {
|
|||
ri.setType(Constants.RESEARCH_INFRASTRUCTURE);
|
||||
}
|
||||
ri.setId(Utils.getContextId(ci.getId()));
|
||||
ri.setOriginalId(ci.getId());
|
||||
ri.setAcronym(ci.getId());
|
||||
|
||||
ri.setDescription(ci.getDescription());
|
||||
ri.setName(ci.getName());
|
||||
|
|
|
@ -12,6 +12,7 @@ import org.dom4j.Node;
|
|||
import org.dom4j.io.SAXReader;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
|
@ -113,14 +114,72 @@ public class QueryInformationSystem {
|
|||
@NotNull
|
||||
private List<String> getCategoryList(Element el, String prefix) {
|
||||
List<String> datasourceList = new ArrayList<>();
|
||||
for (Object node : el.selectNodes(".//param")) {
|
||||
Node n = (Node) node;
|
||||
if (n.valueOf("./@name").equals("openaireId")) {
|
||||
datasourceList.add(prefix + "|" + n.getText());
|
||||
}
|
||||
for (Object node : el.selectNodes(".//concept")) {
|
||||
String oid = getOpenaireId((Node) node, prefix);
|
||||
if (oid != null)
|
||||
datasourceList.add(oid);
|
||||
}
|
||||
|
||||
return datasourceList;
|
||||
}
|
||||
|
||||
private String getOpenaireId(Node el, String prefix) {
|
||||
|
||||
for (Object node : el.selectNodes(".//param")) {
|
||||
Node n = (Node) node;
|
||||
if (n.valueOf("./@name").equals("openaireId")) {
|
||||
return prefix + "|" + n.getText();
|
||||
}
|
||||
}
|
||||
return makeOpenaireId(el, prefix);
|
||||
|
||||
}
|
||||
|
||||
private String makeOpenaireId(Node el, String prefix) {
|
||||
String funder = null;
|
||||
String grantId = null;
|
||||
String funding = null;
|
||||
for (Object node : el.selectNodes(".//param")) {
|
||||
Node n = (Node) node;
|
||||
switch (n.valueOf("./@name")) {
|
||||
case "funding":
|
||||
funding = n.getText();
|
||||
break;
|
||||
case "funder":
|
||||
funder = n.getText();
|
||||
break;
|
||||
case "CD_PROJECT_NUMBER":
|
||||
grantId = n.getText();
|
||||
break;
|
||||
}
|
||||
}
|
||||
String nsp = null;
|
||||
switch (funder.toLowerCase()) {
|
||||
case "ec":
|
||||
if (funding == null) {
|
||||
return null;
|
||||
}
|
||||
if (funding.toLowerCase().startsWith("h2020")) {
|
||||
nsp = "corda__h2020::";
|
||||
} else {
|
||||
nsp = "corda_______::";
|
||||
}
|
||||
break;
|
||||
case "tubitak":
|
||||
nsp = "tubitakf____::";
|
||||
break;
|
||||
case "dfg":
|
||||
nsp = "dfgf________::";
|
||||
break;
|
||||
default:
|
||||
nsp = funder.toLowerCase();
|
||||
for (int i = funder.length(); i < 12; i++)
|
||||
nsp += "_";
|
||||
nsp += "::";
|
||||
}
|
||||
|
||||
return prefix + "|" + nsp + DHPUtils.md5(grantId);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -97,7 +97,7 @@ public class CreateEntityTest {
|
|||
Assertions.assertEquals(12, riList.size());
|
||||
|
||||
riList.stream().forEach(c -> {
|
||||
switch (c.getOriginalId()) {
|
||||
switch (c.getAcronym()) {
|
||||
case "mes":
|
||||
Assertions
|
||||
.assertTrue(c.getType().equals(eu.dnetlib.dhp.oa.graph.dump.Constants.RESEARCH_COMMUNITY));
|
||||
|
@ -115,9 +115,9 @@ public class CreateEntityTest {
|
|||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID, Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5(c.getOriginalId()))));
|
||||
DHPUtils.md5(c.getAcronym()))));
|
||||
Assertions.assertTrue(c.getZenodo_community().equals("https://zenodo.org/communities/oac_mes"));
|
||||
Assertions.assertTrue("mes".equals(c.getOriginalId()));
|
||||
Assertions.assertTrue("mes".equals(c.getAcronym()));
|
||||
break;
|
||||
case "clarin":
|
||||
Assertions
|
||||
|
@ -130,9 +130,9 @@ public class CreateEntityTest {
|
|||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID, Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5(c.getOriginalId()))));
|
||||
DHPUtils.md5(c.getAcronym()))));
|
||||
Assertions.assertTrue(c.getZenodo_community().equals("https://zenodo.org/communities/oac_clarin"));
|
||||
Assertions.assertTrue("clarin".equals(c.getOriginalId()));
|
||||
Assertions.assertTrue("clarin".equals(c.getAcronym()));
|
||||
break;
|
||||
}
|
||||
// TODO add check for all the others Entities
|
||||
|
|
|
@ -9,11 +9,14 @@ import org.junit.jupiter.api.Assertions;
|
|||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.Gson;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class CreateRelationTest {
|
||||
|
@ -203,6 +206,7 @@ public class CreateRelationTest {
|
|||
" <param name=\"suggestedAcknowledgement\"/>\n" +
|
||||
" <param name=\"zenodoCommunity\">oac_ni</param>\n" +
|
||||
" <param name=\"creationdate\">2018-03-01T12:00:00</param>\n" +
|
||||
" <category claim=\"false\" id=\"ni::projects\" label=\"NI Content providers\"/>\n" +
|
||||
" <category claim=\"false\" id=\"ni::contentproviders\" label=\"NI Content providers\">\n" +
|
||||
" <concept claim=\"false\" id=\"ni::contentproviders::1\" label=\"OpenNeuro\">\n" +
|
||||
" <param name=\"openaireId\">re3data_____::5b9bf9171d92df854cf3c520692e9122</param>\n" +
|
||||
|
@ -437,7 +441,65 @@ public class CreateRelationTest {
|
|||
" <param name=\"suggestedAcknowledgement\"/>\n" +
|
||||
" <param name=\"zenodoCommunity\">oaa_elixir-gr</param>\n" +
|
||||
" <param name=\"creationdate\">2018-03-01T12:00:00</param>\n" +
|
||||
" <category claim=\"false\" id=\"elixir-gr::projects\" label=\"ELIXIR GR Projects\"/>\n" +
|
||||
" <category claim=\"false\" id=\"elixir-gr::projects\" label=\"ELIXIR GR Projects\">\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::12\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">BIO-INFORMATICS RESEARCH NETWORK COORDINATING CENTER (BIRN-CC)</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">1U24RR025736-01</param>\n" +
|
||||
" <param name=\"funder\">NIH</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::13\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">COLLABORATIVE RESEARCH: The Cognitive Neuroscience of Category Learning</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">0223843</param>\n" +
|
||||
" <param name=\"funder\">NSF</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::14\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">The Cognitive Atlas: Developing an Interdisciplinary Knowledge Base Through Socia</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">5R01MH082795-05</param>\n" +
|
||||
" <param name=\"funder\">NIH</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::15\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">Fragmented early life environmental and emotional / cognitive vulnerabilities</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">1P50MH096889-01A1</param>\n" +
|
||||
" <param name=\"funder\">NIH</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::16\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">Enhancement of the 1000 Functional Connectome Project</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">1R03MH096321-01A1</param>\n" +
|
||||
" <param name=\"funder\">TUBITAK</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::17\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">CRCNS Data Sharing: An open data repository for cognitive neuroscience: The OpenfMRI Project</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">1131441</param>\n" +
|
||||
" <param name=\"funder\">NSF</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::18\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">Enhancing Human Cortical Plasticity: Visual Psychophysics and fMRI</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">0121950</param>\n" +
|
||||
" <param name=\"funder\">NSF</param>\n" +
|
||||
" </concept>\n" +
|
||||
" <concept claim=\"false\" id=\"ni::projects::18\" label=\"\">\n" +
|
||||
" <param name=\"projectfullname\">Transforming statistical methodology for neuroimaging meta-analysis.</param>\n"
|
||||
+
|
||||
" <param name=\"acronym\"/>\n" +
|
||||
" <param name=\"CD_PROJECT_NUMBER\">100309</param>\n" +
|
||||
" <param name=\"funder\">WT</param>\n" +
|
||||
" </concept>\n" +
|
||||
" </category>" +
|
||||
|
||||
" <category claim=\"false\" id=\"elixir-gr::contentproviders\" label=\"Elixir-GR Content providers\">\n"
|
||||
+
|
||||
" <concept claim=\"false\" id=\"elixir-gr::contentproviders::1\" label=\"bio.tools\">\n" +
|
||||
|
@ -566,4 +628,98 @@ public class CreateRelationTest {
|
|||
tmp.contains("10|doajarticles::2899208a99aa7d142646e0a80bfeef05"));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test2() {
|
||||
List<ContextInfo> cInfoList = new ArrayList<>();
|
||||
final Consumer<ContextInfo> consumer = ci -> cInfoList.add(ci);
|
||||
|
||||
queryInformationSystem
|
||||
.getContextRelation(consumer, "projects", ModelSupport.getIdPrefix(Project.class));
|
||||
|
||||
cInfoList.forEach(c -> System.out.println(new Gson().toJson(c)));
|
||||
|
||||
List<Relation> rList = new ArrayList<>();
|
||||
|
||||
cInfoList.forEach(cInfo -> Process.getRelation(cInfo).forEach(rList::add));
|
||||
|
||||
Assertions.assertEquals(44, rList.size());
|
||||
|
||||
Assertions
|
||||
.assertFalse(
|
||||
rList
|
||||
.stream()
|
||||
.map(r -> r.getSource().getId())
|
||||
.collect(Collectors.toSet())
|
||||
.contains(
|
||||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID,
|
||||
Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5("dh-ch"))));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
rList
|
||||
.stream()
|
||||
.filter(
|
||||
r -> r
|
||||
.getSource()
|
||||
.getId()
|
||||
.equals(
|
||||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID,
|
||||
Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5("clarin"))))
|
||||
.collect(Collectors.toList())
|
||||
.size());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
rList
|
||||
.stream()
|
||||
.filter(
|
||||
r -> r
|
||||
.getTarget()
|
||||
.getId()
|
||||
.equals(
|
||||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID,
|
||||
Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5("clarin"))))
|
||||
.collect(Collectors.toList())
|
||||
.size());
|
||||
|
||||
Set<String> tmp = rList
|
||||
.stream()
|
||||
.filter(
|
||||
r -> r
|
||||
.getSource()
|
||||
.getId()
|
||||
.equals(
|
||||
String
|
||||
.format(
|
||||
"%s|%s::%s", Constants.CONTEXT_ID,
|
||||
Constants.CONTEXT_NS_PREFIX,
|
||||
DHPUtils.md5("clarin"))))
|
||||
.map(r -> r.getTarget().getId())
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
Assertions
|
||||
.assertTrue(
|
||||
tmp.contains("40|corda__h2020::b5a4eb56bf84bef2ebc193306b4d423f") &&
|
||||
tmp.contains("40|corda_______::ef782b2d85676aa3e5a907427feb18c4"));
|
||||
|
||||
rList.forEach(rel -> {
|
||||
if (rel.getSource().getId().startsWith("40|")) {
|
||||
String proj = rel.getSource().getId().substring(3);
|
||||
Assertions.assertTrue(proj.substring(0, proj.indexOf("::")).length() == 12);
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue