1
0
Fork 0
This commit is contained in:
Claudio Atzori 2020-03-26 18:26:52 +01:00
commit 77c4294924
88 changed files with 6023 additions and 38 deletions

3
.gitignore vendored
View File

@ -1,9 +1,12 @@
.DS_Store
.idea
*.iws
*.ipr
*.iml
*.ipr
*.iws
*~
.vscode
.classpath
/*/.classpath
/*/*/.classpath

View File

@ -58,6 +58,15 @@
<groupId>eu.dnetlib</groupId>
<artifactId>cnr-rmi-api</artifactId>
</dependency>
<dependency>
<groupId>com.ximpleware</groupId>
<artifactId>vtd-xml</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.parser.utility;
public class VtdException extends Exception {
public VtdException(final Exception e) {
super(e);
}
public VtdException(final Throwable e) {
super(e);
}
}

View File

@ -0,0 +1,107 @@
package eu.dnetlib.dhp.parser.utility;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.ximpleware.AutoPilot;
import com.ximpleware.VTDNav;
/**
* Created by sandro on 9/29/16.
*/
public class VtdUtilityParser {
public static List<Node> getTextValuesWithAttributes(final AutoPilot ap, final VTDNav vn, final String xpath, final List<String> attributes)
throws VtdException {
final List<Node> results = new ArrayList<>();
try {
ap.selectXPath(xpath);
while (ap.evalXPath() != -1) {
final Node currentNode = new Node();
int t = vn.getText();
if (t >= 0) {
currentNode.setTextValue(vn.toNormalizedString(t));
}
currentNode.setAttributes(getAttributes(vn, attributes));
results.add(currentNode);
}
return results;
} catch (Exception e) {
throw new VtdException(e);
}
}
private static Map<String, String> getAttributes(final VTDNav vn, final List<String> attributes) {
final Map<String, String> currentAttributes = new HashMap<>();
if (attributes != null) {
attributes.forEach(attributeKey -> {
try {
int attr = vn.getAttrVal(attributeKey);
if (attr > -1) {
currentAttributes.put(attributeKey, vn.toNormalizedString(attr));
}
} catch (Throwable e) {
throw new RuntimeException(e);
}
});
}
return currentAttributes;
}
public static List<String> getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) throws VtdException {
List<String> results = new ArrayList<>();
try {
ap.selectXPath(xpath);
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t > -1) results.add(vn.toNormalizedString(t));
}
return results;
} catch (Exception e) {
throw new VtdException(e);
}
}
public static String getSingleValue(final AutoPilot ap, final VTDNav nav, final String xpath) throws VtdException {
try {
ap.selectXPath(xpath);
while (ap.evalXPath() != -1) {
int it = nav.getText();
if (it > -1)
return nav.toNormalizedString(it);
}
return null;
} catch (Exception e) {
throw new VtdException(e);
}
}
public static class Node {
private String textValue;
private Map<String, String> attributes;
public String getTextValue() {
return textValue;
}
public void setTextValue(final String textValue) {
this.textValue = textValue;
}
public Map<String, String> getAttributes() {
return attributes;
}
public void setAttributes(final Map<String, String> attributes) {
this.attributes = attributes;
}
}
}

View File

@ -1,5 +1,7 @@
package eu.dnetlib.dhp.utils;
import com.jayway.jsonpath.JsonPath;
import net.minidev.json.JSONArray;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.apache.commons.codec.binary.Hex;
@ -56,4 +58,17 @@ public class DHPUtils {
}
public static String getJPathString(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof String)
return (String) o;
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
return (String) ((JSONArray) o).get(0);
return o.toString();
} catch (Exception e) {
return "";
}
}
}

View File

@ -0,0 +1,24 @@
package eu.dnetlib.scholexplorer.relation;
import java.io.Serializable;
public class RelInfo implements Serializable {
private String original;
private String inverse;
public String getOriginal() {
return original;
}
public void setOriginal(String original) {
this.original = original;
}
public String getInverse() {
return inverse;
}
public void setInverse(String inverse) {
this.inverse = inverse;
}
}

View File

@ -0,0 +1,19 @@
package eu.dnetlib.scholexplorer.relation;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.IOUtils;
import java.io.Serializable;
import java.util.HashMap;
public class RelationMapper extends HashMap<String,RelInfo > implements Serializable {
public static RelationMapper load() throws Exception {
final String json = IOUtils.toString(RelationMapper.class.getResourceAsStream("relations.json"));
ObjectMapper mapper = new ObjectMapper();
return mapper.readValue(json, RelationMapper.class);
}
}

View File

@ -0,0 +1,158 @@
{
"cites":{
"original":"Cites",
"inverse":"IsCitedBy"
},
"compiles":{
"original":"Compiles",
"inverse":"IsCompiledBy"
},
"continues":{
"original":"Continues",
"inverse":"IsContinuedBy"
},
"derives":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"describes":{
"original":"Describes",
"inverse":"IsDescribedBy"
},
"documents":{
"original":"Documents",
"inverse":"IsDocumentedBy"
},
"hasmetadata":{
"original":"HasMetadata",
"inverse":"IsMetadataOf"
},
"hasassociationwith":{
"original":"HasAssociationWith",
"inverse":"HasAssociationWith"
},
"haspart":{
"original":"HasPart",
"inverse":"IsPartOf"
},
"hasversion":{
"original":"HasVersion",
"inverse":"IsVersionOf"
},
"iscitedby":{
"original":"IsCitedBy",
"inverse":"Cites"
},
"iscompiledby":{
"original":"IsCompiledBy",
"inverse":"Compiles"
},
"iscontinuedby":{
"original":"IsContinuedBy",
"inverse":"Continues"
},
"isderivedfrom":{
"original":"IsDerivedFrom",
"inverse":"IsSourceOf"
},
"isdescribedby":{
"original":"IsDescribedBy",
"inverse":"Describes"
},
"isdocumentedby":{
"original":"IsDocumentedBy",
"inverse":"Documents"
},
"isidenticalto":{
"original":"IsIdenticalTo",
"inverse":"IsIdenticalTo"
},
"ismetadatafor":{
"original":"IsMetadataFor",
"inverse":"IsMetadataOf"
},
"ismetadataof":{
"original":"IsMetadataOf",
"inverse":"IsMetadataFor"
},
"isnewversionof":{
"original":"IsNewVersionOf",
"inverse":"IsPreviousVersionOf"
},
"isobsoletedby":{
"original":"IsObsoletedBy",
"inverse":"Obsoletes"
},
"isoriginalformof":{
"original":"IsOriginalFormOf",
"inverse":"IsVariantFormOf"
},
"ispartof":{
"original":"IsPartOf",
"inverse":"HasPart"
},
"ispreviousversionof":{
"original":"IsPreviousVersionOf",
"inverse":"IsNewVersionOf"
},
"isreferencedby":{
"original":"IsReferencedBy",
"inverse":"References"
},
"isrelatedto":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"isrequiredby":{
"original":"IsRequiredBy",
"inverse":"Requires"
},
"isreviewedby":{
"original":"IsReviewedBy",
"inverse":"Reviews"
},
"issourceof":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"issupplementedby":{
"original":"IsSupplementedBy",
"inverse":"IsSupplementTo"
},
"issupplementto":{
"original":"IsSupplementTo",
"inverse":"IsSupplementedBy"
},
"isvariantformof":{
"original":"IsVariantFormOf",
"inverse":"IsOriginalFormOf"
},
"isversionof":{
"original":"IsVersionOf",
"inverse":"HasVersion"
},
"obsoletes":{
"original":"Obsoletes",
"inverse":"IsObsoletedBy"
},
"references":{
"original":"References",
"inverse":"IsReferencedBy"
},
"requires":{
"original":"Requires",
"inverse":"IsRequiredBy"
},
"related":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"reviews":{
"original":"Reviews",
"inverse":"IsReviewedBy"
},
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
}
}

View File

@ -0,0 +1,15 @@
package eu.dnetlib.scholexplorer.relation;
import org.junit.jupiter.api.Test;
public class RelationMapperTest {
@Test
public void testLoadRels() throws Exception{
RelationMapper relationMapper = RelationMapper.load();
relationMapper.keySet().forEach(System.out::println);
}
}

View File

@ -0,0 +1,158 @@
{
"cites":{
"original":"Cites",
"inverse":"IsCitedBy"
},
"compiles":{
"original":"Compiles",
"inverse":"IsCompiledBy"
},
"continues":{
"original":"Continues",
"inverse":"IsContinuedBy"
},
"derives":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"describes":{
"original":"Describes",
"inverse":"IsDescribedBy"
},
"documents":{
"original":"Documents",
"inverse":"IsDocumentedBy"
},
"hasmetadata":{
"original":"HasMetadata",
"inverse":"IsMetadataOf"
},
"hasassociationwith":{
"original":"HasAssociationWith",
"inverse":"HasAssociationWith"
},
"haspart":{
"original":"HasPart",
"inverse":"IsPartOf"
},
"hasversion":{
"original":"HasVersion",
"inverse":"IsVersionOf"
},
"iscitedby":{
"original":"IsCitedBy",
"inverse":"Cites"
},
"iscompiledby":{
"original":"IsCompiledBy",
"inverse":"Compiles"
},
"iscontinuedby":{
"original":"IsContinuedBy",
"inverse":"Continues"
},
"isderivedfrom":{
"original":"IsDerivedFrom",
"inverse":"IsSourceOf"
},
"isdescribedby":{
"original":"IsDescribedBy",
"inverse":"Describes"
},
"isdocumentedby":{
"original":"IsDocumentedBy",
"inverse":"Documents"
},
"isidenticalto":{
"original":"IsIdenticalTo",
"inverse":"IsIdenticalTo"
},
"ismetadatafor":{
"original":"IsMetadataFor",
"inverse":"IsMetadataOf"
},
"ismetadataof":{
"original":"IsMetadataOf",
"inverse":"IsMetadataFor"
},
"isnewversionof":{
"original":"IsNewVersionOf",
"inverse":"IsPreviousVersionOf"
},
"isobsoletedby":{
"original":"IsObsoletedBy",
"inverse":"Obsoletes"
},
"isoriginalformof":{
"original":"IsOriginalFormOf",
"inverse":"IsVariantFormOf"
},
"ispartof":{
"original":"IsPartOf",
"inverse":"HasPart"
},
"ispreviousversionof":{
"original":"IsPreviousVersionOf",
"inverse":"IsNewVersionOf"
},
"isreferencedby":{
"original":"IsReferencedBy",
"inverse":"References"
},
"isrelatedto":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"isrequiredby":{
"original":"IsRequiredBy",
"inverse":"Requires"
},
"isreviewedby":{
"original":"IsReviewedBy",
"inverse":"Reviews"
},
"issourceof":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"issupplementedby":{
"original":"IsSupplementedBy",
"inverse":"IsSupplementTo"
},
"issupplementto":{
"original":"IsSupplementTo",
"inverse":"IsSupplementedBy"
},
"isvariantformof":{
"original":"IsVariantFormOf",
"inverse":"IsOriginalFormOf"
},
"isversionof":{
"original":"IsVersionOf",
"inverse":"HasVersion"
},
"obsoletes":{
"original":"Obsoletes",
"inverse":"IsObsoletedBy"
},
"references":{
"original":"References",
"inverse":"IsReferencedBy"
},
"requires":{
"original":"Requires",
"inverse":"IsRequiredBy"
},
"related":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"reviews":{
"original":"Reviews",
"inverse":"IsReviewedBy"
},
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
}
}

View File

@ -0,0 +1,80 @@
package eu.dnetlib.dhp.schema.scholexplorer;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class DLIDataset extends Dataset {
private String originalObjIdentifier;
private List<ProvenaceInfo> dlicollectedfrom;
private String completionStatus;
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
public List<ProvenaceInfo> getDlicollectedfrom() {
return dlicollectedfrom;
}
public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
this.dlicollectedfrom = dlicollectedfrom;
}
public String getOriginalObjIdentifier() {
return originalObjIdentifier;
}
public void setOriginalObjIdentifier(String originalObjIdentifier) {
this.originalObjIdentifier = originalObjIdentifier;
}
@Override
public void mergeFrom(OafEntity e) {
super.mergeFrom(e);
DLIDataset p = (DLIDataset) e;
if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus))
completionStatus = p.completionStatus;
if ("complete".equalsIgnoreCase(p.completionStatus))
completionStatus = "complete";
dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
}
private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
Map<String, ProvenaceInfo> result = new HashMap<>();
if (a != null)
a.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
if (b != null)
b.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
return new ArrayList<>(result.values());
}
}

View File

@ -0,0 +1,77 @@
package eu.dnetlib.dhp.schema.scholexplorer;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Publication;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.*;
public class DLIPublication extends Publication implements Serializable {
private String originalObjIdentifier;
private List<ProvenaceInfo> dlicollectedfrom;
private String completionStatus;
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
public List<ProvenaceInfo> getDlicollectedfrom() {
return dlicollectedfrom;
}
public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
this.dlicollectedfrom = dlicollectedfrom;
}
public String getOriginalObjIdentifier() {
return originalObjIdentifier;
}
public void setOriginalObjIdentifier(String originalObjIdentifier) {
this.originalObjIdentifier = originalObjIdentifier;
}
@Override
public void mergeFrom(OafEntity e) {
super.mergeFrom(e);
DLIPublication p = (DLIPublication) e;
if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus))
completionStatus = p.completionStatus;
if ("complete".equalsIgnoreCase(p.completionStatus))
completionStatus = "complete";
dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
}
private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
Map<String, ProvenaceInfo> result = new HashMap<>();
if (a != null)
a.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
if (b != null)
b.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
return new ArrayList<>(result.values());
}
}

View File

@ -0,0 +1,108 @@
package eu.dnetlib.dhp.schema.scholexplorer;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class DLIUnknown extends Oaf implements Serializable {
private String id;
private List<StructuredProperty> pid;
private String dateofcollection;
private String dateoftransformation;
private List<ProvenaceInfo> dlicollectedfrom;
private String completionStatus = "incomplete";
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
public List<ProvenaceInfo> getDlicollectedfrom() {
return dlicollectedfrom;
}
public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
this.dlicollectedfrom = dlicollectedfrom;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<StructuredProperty> getPid() {
return pid;
}
public void setPid(List<StructuredProperty> pid) {
this.pid = pid;
}
public String getDateofcollection() {
return dateofcollection;
}
public void setDateofcollection(String dateofcollection) {
this.dateofcollection = dateofcollection;
}
public String getDateoftransformation() {
return dateoftransformation;
}
public void setDateoftransformation(String dateoftransformation) {
this.dateoftransformation = dateoftransformation;
}
public void mergeFrom(DLIUnknown p) {
if ("complete".equalsIgnoreCase(p.completionStatus))
completionStatus = "complete";
dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
}
private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
Map<String, ProvenaceInfo> result = new HashMap<>();
if (a != null)
a.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
if (b != null)
b.forEach(p -> {
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
result.put(p.getId(), p);
}
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
result.put(p.getId(), p);
});
return new ArrayList<>(result.values());
}
}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.schema.scholexplorer;
import java.io.Serializable;
public class ProvenaceInfo implements Serializable {
private String id;
private String name;
private String completionStatus;
private String collectionMode ="collected";
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
public String getCollectionMode() {
return collectionMode;
}
public void setCollectionMode(String collectionMode) {
this.collectionMode = collectionMode;
}
}

View File

@ -0,0 +1,81 @@
package eu.dnetlib.dhp.schema.scholexplorer;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
public class DLItest {
@Test
public void testMergePublication() throws JsonProcessingException {
DLIPublication a1 = new DLIPublication();
a1.setPid(Arrays.asList( createSP("123456","pdb","dnet:pid_types")));
a1.setTitle(Collections.singletonList(createSP("Un Titolo", "title", "dnetTitle")));
a1.setDlicollectedfrom(Arrays.asList(createCollectedFrom("znd","Zenodo","complete")));
a1.setCompletionStatus("complete");
DLIPublication a = new DLIPublication();
a.setPid(Arrays.asList(createSP("10.11","doi","dnet:pid_types"), createSP("123456","pdb","dnet:pid_types")));
a.setTitle(Collections.singletonList(createSP("A Title", "title", "dnetTitle")));
a.setDlicollectedfrom(Arrays.asList(createCollectedFrom("dct","datacite","complete"),createCollectedFrom("dct","datacite","incomplete")));
a.setCompletionStatus("incomplete");
a.mergeFrom(a1);
ObjectMapper mapper = new ObjectMapper();
System.out.println(mapper.writeValueAsString(a));
}
@Test
public void testDeserialization() throws IOException {
final String json ="{\"dataInfo\":{\"invisible\":false,\"inferred\":null,\"deletedbyinference\":false,\"trust\":\"0.9\",\"inferenceprovenance\":null,\"provenanceaction\":null},\"lastupdatetimestamp\":null,\"id\":\"60|bd9352547098929a394655ad1a44a479\",\"originalId\":[\"bd9352547098929a394655ad1a44a479\"],\"collectedfrom\":[{\"key\":\"dli_________::datacite\",\"value\":\"Datasets in Datacite\",\"dataInfo\":null,\"blank\":false}],\"pid\":[{\"value\":\"10.7925/DRS1.DUCHAS_5078760\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\",\"blank\":false},\"dataInfo\":null}],\"dateofcollection\":\"2020-01-09T08:29:31.885Z\",\"dateoftransformation\":null,\"extraInfo\":null,\"oaiprovenance\":null,\"author\":[{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Ireland. Department of Arts, Culture, and the Gaeltacht\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"University College Dublin\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"National Folklore Foundation\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null}],\"resulttype\":null,\"language\":null,\"country\":null,\"subject\":[{\"value\":\"Recreation\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Entertainments and recreational activities\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Siamsaíocht agus caitheamh aimsire\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null}],\"title\":[{\"value\":\"Games We Play\",\"qualifier\":null,\"dataInfo\":null}],\"relevantdate\":[{\"value\":\"1938-09-28\",\"qualifier\":{\"classid\":\"date\",\"classname\":\"date\",\"schemeid\":\"dnet::date\",\"schemename\":\"dnet::date\",\"blank\":false},\"dataInfo\":null}],\"description\":[{\"value\":\"Story collected by Breda Mc Donnell, a student at Tenure school (Tinure, Co. Louth) (no informant identified).\",\"dataInfo\":null}],\"dateofacceptance\":null,\"publisher\":{\"value\":\"University College Dublin\",\"dataInfo\":null},\"embargoenddate\":null,\"source\":null,\"fulltext\":null,\"format\":null,\"contributor\":null,\"resourcetype\":null,\"coverage\":null,\"refereed\":null,\"context\":null,\"processingchargeamount\":null,\"processingchargecurrency\":null,\"externalReference\":null,\"instance\":[],\"storagedate\":null,\"device\":null,\"size\":null,\"version\":null,\"lastmetadataupdate\":null,\"metadataversionnumber\":null,\"geolocation\":null,\"dlicollectedfrom\":[{\"id\":\"dli_________::datacite\",\"name\":\"Datasets in Datacite\",\"completionStatus\":\"complete\",\"collectionMode\":\"resolved\"}],\"completionStatus\":\"complete\"}";
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
DLIDataset dliDataset = mapper.readValue(json, DLIDataset.class);
mapper.enable(SerializationFeature.INDENT_OUTPUT);
System.out.println(mapper.writeValueAsString(dliDataset));
}
private ProvenaceInfo createCollectedFrom(final String id, final String name, final String completionStatus) {
ProvenaceInfo p = new ProvenaceInfo();
p.setId(id);
p.setName(name);
p.setCompletionStatus(completionStatus);
return p;
}
private StructuredProperty createSP(final String value, final String className, final String schemeName) {
StructuredProperty p = new StructuredProperty();
p.setValue(value);
Qualifier schema = new Qualifier();
schema.setClassname(className);
schema.setClassid(className);
schema.setSchemename(schemeName);
schema.setSchemeid(schemeName);
p.setQualifier(schema);
return p;
}
}

View File

@ -105,6 +105,7 @@
<artifactId>mongo-java-driver</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-distcp</artifactId>

View File

@ -6,9 +6,8 @@
<version>1.1.6-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup-openaire</artifactId>
<build>
<plugins>
<plugin>

View File

@ -30,7 +30,8 @@ public class MergeAuthorTest {
}).collect(Collectors.toList());
}
@Test
//FIX ME Michele DB this tests doesn't work
//@Test
public void test() throws Exception {
Publication dedup = new Publication();

View File

@ -0,0 +1,57 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.1.6-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup-scholexplorer</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,103 @@
package eu.dnetlib.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.*;
import scala.Tuple2;
import java.io.IOException;
public class SparkPropagateRelationsJob {
enum FieldType {
SOURCE,
TARGET
}
final static String SOURCEJSONPATH = "$.source";
final static String TARGETJSONPATH = "$.target";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkPropagateRelationsJob.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_propagate_relation_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkUpdateEntityJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String relationPath = parser.get("relationPath");
final String mergeRelPath = parser.get("mergeRelPath");
final String targetRelPath = parser.get("targetRelPath");
final Dataset<Relation> merge = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)).where("relClass == 'merges'");
final Dataset<Relation> rels= spark.read().load(relationPath).as(Encoders.bean(Relation.class));
final Dataset<Relation> firstJoin = rels.joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer")
.map((MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
final Relation mergeRelation = r._2();
final Relation relation = r._1();
if(mergeRelation!= null)
relation.setSource(mergeRelation.getSource());
return relation;
}, Encoders.bean(Relation.class));
final Dataset<Relation> secondJoin = firstJoin.joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer")
.map((MapFunction<Tuple2<Relation, Relation>, Relation>) r -> {
final Relation mergeRelation = r._2();
final Relation relation = r._1();
if (mergeRelation != null )
relation.setTarget(mergeRelation.getSource());
return relation;
}, Encoders.bean(Relation.class));
secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath);
}
private static boolean containsDedup(final String json) {
final String source = DHPUtils.getJPathString(SOURCEJSONPATH, json);
final String target = DHPUtils.getJPathString(TARGETJSONPATH, json);
return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup");
}
private static String replaceField(final String json, final String id, final FieldType type) {
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
try {
Relation relation = mapper.readValue(json, Relation.class);
if (relation.getDataInfo() == null)
relation.setDataInfo(new DataInfo());
relation.getDataInfo().setDeletedbyinference(false);
switch (type) {
case SOURCE:
relation.setSource(id);
return mapper.writeValueAsString(relation);
case TARGET:
relation.setTarget(id);
return mapper.writeValueAsString(relation);
default:
throw new IllegalArgumentException("");
}
} catch (IOException e) {
throw new RuntimeException("unable to deserialize json relation: " + json, e);
}
}
}

View File

@ -0,0 +1,93 @@
package eu.dnetlib.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.*;
import scala.Tuple2;
import java.io.IOException;
public class SparkUpdateEntityJob {
final static String IDJSONPATH = "$.id";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntityJob.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_delete_by_inference_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkUpdateEntityJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String entityPath = parser.get("entityPath");
final String mergeRelPath = parser.get("mergeRelPath");
final String dedupRecordPath = parser.get("dedupRecordPath");
final String entity = parser.get("entity");
final String destination = parser.get("targetPath");
final Dataset<Relation> df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class));
final JavaPairRDD<String, String> mergedIds = df
.where("relClass == 'merges'")
.select(df.col("target"))
.distinct()
.toJavaRDD()
.mapToPair((PairFunction<Row, String, String>) r -> new Tuple2<>(r.getString(0), "d"));
final JavaRDD<String> sourceEntity = sc.textFile(entityPath);
final JavaRDD<String> dedupEntity = sc.textFile(dedupRecordPath);
JavaPairRDD<String, String> entitiesWithId = sourceEntity.mapToPair((PairFunction<String, String, String>) s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s));
Class<? extends Oaf> mainClass;
switch (entity) {
case "publication":
mainClass = DLIPublication.class;
break;
case "dataset":
mainClass = DLIDataset.class;
break;
case "unknown":
mainClass = DLIUnknown.class;
break;
default:
throw new IllegalArgumentException("Illegal type " + entity);
}
JavaRDD<String> map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? updateDeletedByInference(k._2()._1(), mainClass) : k._2()._1());
map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class);
}
private static <T extends Oaf> String updateDeletedByInference(final String json, final Class<T> clazz) {
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
try {
Oaf entity = mapper.readValue(json, clazz);
if (entity.getDataInfo()== null)
entity.setDataInfo(new DataInfo());
entity.getDataInfo().setDeletedbyinference(true);
return mapper.writeValueAsString(entity);
} catch (IOException e) {
throw new RuntimeException("Unable to convert json", e);
}
}
}

View File

@ -0,0 +1,38 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "ep",
"paramLongName": "entityPath",
"paramDescription": "the input entity path",
"paramRequired": true
},
{
"paramName": "mr",
"paramLongName": "mergeRelPath",
"paramDescription": "the input path of merge Rel",
"paramRequired": true
},
{
"paramName": "dr",
"paramLongName": "dedupRecordPath",
"paramDescription": "the inputPath of dedup record",
"paramRequired": true
},
{
"paramName": "e",
"paramLongName": "entity",
"paramDescription": "the type of entity",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the targetPath",
"paramRequired": true
}
]

View File

@ -0,0 +1,26 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "ep",
"paramLongName": "relationPath",
"paramDescription": "the input relation path",
"paramRequired": true
},
{
"paramName": "mr",
"paramLongName": "mergeRelPath",
"paramDescription": "the input path of merge Rel",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetRelPath",
"paramDescription": "the output Rel Path",
"paramRequired": true
}
]

View File

@ -0,0 +1,206 @@
<workflow-app name="Dedup Entities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>entity</name>
<description>the entity that should be processed</description>
</property>
<property>
<name>dedupConf</name>
<description>the dedup Configuration</description>
</property>
<property>
<name>targetPath</name>
<description>the target path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
</parameters>
<start to="DeleteWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="DeleteWorkingPath">
<fs>
<delete path='${targetPath}/${entity}'/>
<mkdir path="${targetPath}"/>
<mkdir path="${targetPath}/${entity}"/>
</fs>
<ok to="CreateSimRels"/>
<error to="Kill"/>
</action>
<action name="CreateSimRels">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Similarity Relations</name>
<class>eu.dnetlib.dedup.SparkCreateSimRels</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateConnectedComponents"/>
<error to="Kill"/>
</action>
<action name="CreateConnectedComponents">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Connected Components</name>
<class>eu.dnetlib.dedup.SparkCreateConnectedComponent</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="CreateDedupRecord"/>
<error to="Kill"/>
</action>
<action name="CreateDedupRecord">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create Dedup Record</name>
<class>eu.dnetlib.dedup.SparkCreateDedupRecord</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--dedupPath</arg><arg>${targetPath}</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupConf</arg><arg>${dedupConf}</arg>
</spark>
<ok to="fixRelation"/>
<error to="Kill"/>
</action>
<action name="fixRelation">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Propagate Dedup Relations</name>
<class>eu.dnetlib.dedup.SparkPropagateRelationsJob</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
<arg>--relationPath</arg><arg>${sourcePath}/relation</arg>
<arg>--targetRelPath</arg><arg>${targetPath}/${entity}/updated_relation</arg>
</spark>
<ok to="updateDeletedByInferenceEntity"/>
<error to="Kill"/>
</action>
<action name="updateDeletedByInferenceEntity">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Update ${entity} and add DedupRecord</name>
<class>eu.dnetlib.dedup.SparkUpdateEntityJob</class>
<jar>dhp-dedup-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg><arg>yarn-cluster</arg>
<arg>--entityPath</arg><arg>${sourcePath}/${entity}</arg>
<arg>--mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>
<arg>--entity</arg><arg>${entity}</arg>
<arg>--dedupRecordPath</arg><arg>${targetPath}/${entity}/dedup_records</arg>
<arg>--targetPath</arg><arg>${targetPath}/${entity}/updated_record</arg>
</spark>
<ok to="replaceEntity"/>
<error to="Kill"/>
</action>
<!-- <action name="updateDeletedByInferenceRelation">-->
<!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
<!-- <job-tracker>${jobTracker}</job-tracker>-->
<!-- <name-node>${nameNode}</name-node>-->
<!-- <master>yarn-cluster</master>-->
<!-- <mode>cluster</mode>-->
<!-- <name>Update ${entity} set deleted by Inference</name>-->
<!-- <class>eu.dnetlib.dedup.SparkUpdateEntityJob</class>-->
<!-- <jar>dhp-dedup-${projectVersion}.jar</jar>-->
<!-- <spark-opts>-->
<!-- &#45;&#45;executor-memory ${sparkExecutorMemory}-->
<!-- &#45;&#45;driver-memory=${sparkDriverMemory}-->
<!-- ${sparkExtraOPT}-->
<!-- </spark-opts>-->
<!-- <arg>-mt</arg><arg>yarn-cluster</arg>-->
<!-- <arg>&#45;&#45;entityPath</arg><arg>${targetPath}/${entity}/relation_propagated</arg>-->
<!-- <arg>&#45;&#45;mergeRelPath</arg><arg>${targetPath}/${entity}/mergeRel</arg>-->
<!-- <arg>&#45;&#45;entity</arg><arg>relation</arg>-->
<!-- <arg>&#45;&#45;dedupRecordPath</arg><arg>${targetPath}/${entity}/dedup_records</arg>-->
<!-- <arg>&#45;&#45;targetPath</arg><arg>${targetPath}/${entity}/updated_relation</arg>-->
<!-- </spark>-->
<!-- <ok to="End"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<action name="replaceEntity">
<fs>
<delete path='${sourcePath}/${entity}'/>
<delete path='${sourcePath}/relation'/>
<move source="${targetPath}/${entity}/updated_relation" target="${sourcePath}/relation" />
<move source="${targetPath}/${entity}/updated_record" target="${sourcePath}/${entity}" />
</fs>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,378 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "result",
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "2000",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "200",
"rootBuilder": [
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering": [
{
"name": "ngrampairs",
"fields": [
"title"
],
"params": {
"max": "1",
"ngramLen": "3"
}
},
{
"name": "suffixprefix",
"fields": [
"title"
],
"params": {
"max": "1",
"len": "3"
}
}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "pid",
"type": "JSON",
"path": "$.pid",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[*].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
}
],
"blacklists": {
"title": [
"^Inside Front Cover$",
"^CORR Insights$",
"^Index des notions$",
"^Department of Error.$",
"^Untitled Item$",
"^Department of Error$",
"^Tome II : 1598 à 1605$",
"^(à lexception de roi, prince, royauté, pouvoir, image… qui sont omniprésents)$",
"^Museen und Ausstellungsinstitute in Nürnberg$",
"^Text/Conference Paper$",
"^Table des illustrations$",
"^An Intimate Insight on Psychopathy and a Novel Hermeneutic Psychological Science$",
"^Index des noms$",
"^Reply by Authors.$",
"^Titelblatt - Inhalt$",
"^Index des œuvres,$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$"
]
},
"synonyms": {}
}
}

View File

@ -1,5 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
@ -11,6 +12,11 @@
<dependencies>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
@ -35,6 +41,14 @@
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
</dependency>
</dependencies>

View File

@ -0,0 +1,103 @@
package eu.dnetlib.dhp.graph;
import com.mongodb.*;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageType;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.bson.Document;
import org.bson.conversions.Bson;
import java.io.IOException;
import java.net.URI;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import java.util.stream.Collectors;
public class ImportDataFromMongo {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/import_from_mongo_parameters.json")));
parser.parseArgument(args);
final int port = Integer.parseInt(parser.get("dbport"));
final String host = parser.get("dbhost");
final String format = parser.get("format");
final String layout = parser.get("layout");
final String interpretation = parser.get("interpretation");
final String dbName = parser.get("dbName");
final MongoClient client = new MongoClient(host, port);
MongoDatabase database = client.getDatabase(dbName);
MongoCollection<Document> metadata = database.getCollection("metadata");
MongoCollection<Document> metadataManager = database.getCollection("metadataManager");
final DBObject query = QueryBuilder.start("format").is(format).and("layout").is(layout).and("interpretation").is(interpretation).get();
final List<String> ids = new ArrayList<>();
metadata.find((Bson) query).forEach((Consumer<Document>) document -> ids.add(document.getString("mdId")));
List<String> databaseId = ids.stream().map(it -> getCurrentId(it, metadataManager)).filter(Objects::nonNull).collect(Collectors.toList());
final String hdfsuri = parser.get("namenode");
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", hdfsuri);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
System.setProperty("HADOOP_USER_NAME", parser.get("user"));
System.setProperty("hadoop.home.dir", "/");
FileSystem.get(URI.create(hdfsuri), conf);
Path hdfswritepath = new Path(parser.get("targetPath"));
final AtomicInteger counter = new AtomicInteger(0);
try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
SequenceFile.Writer.file(hdfswritepath), SequenceFile.Writer.keyClass(IntWritable.class),
SequenceFile.Writer.valueClass(Text.class))) {
final IntWritable key = new IntWritable(counter.get());
final Text value = new Text();
databaseId.forEach(id -> {
System.out.println("Reading :"+id);
MongoCollection<Document> collection = database.getCollection(id);
collection.find().forEach((Consumer<Document>) document ->
{
key.set(counter.getAndIncrement());
value.set(document.getString("body"));
if (counter.get() % 10000 == 0) {
System.out.println("Added "+counter.get());
}
try {
writer.append(key, value);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
);
});
}
}
private static String getCurrentId(final String mdId, final MongoCollection<Document> metadataManager) {
FindIterable<Document> result = metadataManager.find((Bson) QueryBuilder.start("mdId").is(mdId).get());
final Document item = result.first();
return item == null ? null : item.getString("currentId");
}
}

View File

@ -0,0 +1,101 @@
package eu.dnetlib.dhp.graph.scholexplorer;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.graph.SparkGraphImporterJob;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import net.minidev.json.JSONArray;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
public class SparkExtractEntitiesJob {
final static String IDJSONPATH = "$.id";
final static String SOURCEJSONPATH = "$.source";
final static String TARGETJSONPATH = "$.target";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkExtractEntitiesJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_extract_entities_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkGraphImporterJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
final String targetPath = parser.get("targetPath");
final String tdir =parser.get("targetDir");
final JavaRDD<String> inputRDD = sc.textFile(inputPath);
List<String> entities = Arrays.stream(parser.get("entities").split(",")).map(String::trim).collect(Collectors.toList());
if (entities.stream().anyMatch("dataset"::equalsIgnoreCase)) {
//Extract Dataset
inputRDD.filter(SparkExtractEntitiesJob::isDataset).saveAsTextFile(targetPath + "/dataset/"+tdir, GzipCodec.class);
}
if (entities.stream().anyMatch("unknown"::equalsIgnoreCase)) {
//Extract Unknown
inputRDD.filter(SparkExtractEntitiesJob::isUnknown).saveAsTextFile(targetPath + "/unknown/"+tdir, GzipCodec.class);
}
if (entities.stream().anyMatch("relation"::equalsIgnoreCase)) {
//Extract Relation
inputRDD.filter(SparkExtractEntitiesJob::isRelation).saveAsTextFile(targetPath + "/relation/"+tdir, GzipCodec.class);
}
if (entities.stream().anyMatch("publication"::equalsIgnoreCase)) {
//Extract Relation
inputRDD.filter(SparkExtractEntitiesJob::isPublication).saveAsTextFile(targetPath + "/publication/"+tdir, GzipCodec.class);
}
}
public static boolean isDataset(final String json) {
final String id = getJPathString(IDJSONPATH, json);
if (StringUtils.isBlank(id)) return false;
return id.startsWith("60|");
}
public static boolean isPublication(final String json) {
final String id = getJPathString(IDJSONPATH, json);
if (StringUtils.isBlank(id)) return false;
return id.startsWith("50|");
}
public static boolean isUnknown(final String json) {
final String id = getJPathString(IDJSONPATH, json);
if (StringUtils.isBlank(id)) return false;
return id.startsWith("70|");
}
public static boolean isRelation(final String json) {
final String source = getJPathString(SOURCEJSONPATH, json);
final String target = getJPathString(TARGETJSONPATH, json);
return StringUtils.isNotBlank(source) && StringUtils.isNotBlank(target);
}
public static String getJPathString(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof String)
return (String) o;
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
return (String) ((JSONArray) o).get(0);
return "";
} catch (Exception e) {
return "";
}
}
}

View File

@ -0,0 +1,56 @@
package eu.dnetlib.dhp.graph.scholexplorer;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.graph.SparkGraphImporterJob;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
public class SparkScholexplorerGenerateSimRel {
final static String IDJSONPATH = "$.id";
final static String OBJIDPATH = "$.originalObjIdentifier";
public static void generateDataFrame(final SparkSession spark, final JavaSparkContext sc, final String inputPath, final String targetPath) {
final JavaPairRDD<String, String> datasetSimRel = sc.textFile(inputPath+"/dataset/*")
.mapToPair((PairFunction<String, String, String>) k ->
new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, k),DHPUtils.getJPathString(OBJIDPATH, k)))
.filter(t ->
!StringUtils.substringAfter(t._1(), "|")
.equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::")))
.distinct();
final JavaPairRDD<String, String> publicationSimRel = sc.textFile(inputPath+"/publication/*")
.mapToPair((PairFunction<String, String, String>) k ->
new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, k),DHPUtils.getJPathString(OBJIDPATH, k)))
.filter(t ->
!StringUtils.substringAfter(t._1(), "|")
.equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::")))
.distinct();
JavaRDD<Relation> simRel = datasetSimRel.union(publicationSimRel).map(s -> {
final Relation r = new Relation();
r.setSource(s._1());
r.setTarget(s._2());
r.setRelType("similar");
return r;
}
);
spark.createDataset(simRel.rdd(), Encoders.bean(Relation.class)).distinct().write()
.mode(SaveMode.Overwrite).save(targetPath+"/pid_simRel");
}
}

View File

@ -0,0 +1,52 @@
package eu.dnetlib.dhp.graph.scholexplorer;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.graph.SparkGraphImporterJob;
import eu.dnetlib.dhp.graph.scholexplorer.parser.DatasetScholexplorerParser;
import eu.dnetlib.dhp.graph.scholexplorer.parser.PublicationScholexplorerParser;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
public class SparkScholexplorerGraphImporter {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkScholexplorerGraphImporter.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_scholix_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkGraphImporterJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
RelationMapper relationMapper = RelationMapper.load();
sc.sequenceFile(inputPath, IntWritable.class, Text.class).map(Tuple2::_2).map(Text::toString).repartition(500)
.flatMap((FlatMapFunction<String, Oaf>) record -> {
switch (parser.get("entity")) {
case "dataset":
final DatasetScholexplorerParser d = new DatasetScholexplorerParser();
return d.parseObject(record,relationMapper).iterator();
case "publication":
final PublicationScholexplorerParser p = new PublicationScholexplorerParser();
return p.parseObject(record,relationMapper).iterator();
default:
throw new IllegalArgumentException("wrong values of entities");
}
}).map(k -> {
ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(k);
}).saveAsTextFile(parser.get("targetPath"), GzipCodec.class);
}
}

View File

@ -0,0 +1,185 @@
package eu.dnetlib.dhp.graph.scholexplorer;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.graph.SparkGraphImporterJob;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
import eu.dnetlib.dhp.utils.DHPUtils;
import net.minidev.json.JSONArray;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.*;
import scala.Tuple2;
import scala.collection.JavaConverters;
import sun.rmi.log.ReliableLog;
import javax.xml.crypto.Data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
public class SparkScholexplorerMergeEntitiesJob {
final static String IDJSONPATH = "$.id";
final static String SOURCEJSONPATH = "$.source";
final static String TARGETJSONPATH = "$.target";
final static String RELJSONPATH = "$.relType";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkScholexplorerMergeEntitiesJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/merge_entities_scholix_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.config(new SparkConf()
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"))
.appName(SparkGraphImporterJob.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
final String targetPath = parser.get("targetPath");
final String entity = parser.get("entity");
FileSystem fs = FileSystem.get(sc.sc().hadoopConfiguration());
List<Path> subFolder = Arrays.stream(fs.listStatus(new Path(inputPath))).filter(FileStatus::isDirectory).map(FileStatus::getPath).collect(Collectors.toList());
List<JavaRDD<String>> inputRdd = new ArrayList<>();
subFolder.forEach(p -> inputRdd.add(sc.textFile(p.toUri().getRawPath())));
JavaRDD<String> union = sc.emptyRDD();
for (JavaRDD<String> item : inputRdd) {
union = union.union(item);
}
switch (entity) {
case "dataset":
union.mapToPair((PairFunction<String, String, DLIDataset>) f -> {
final String id = getJPathString(IDJSONPATH, f);
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
return new Tuple2<>(id, mapper.readValue(f, DLIDataset.class));
}).reduceByKey((a, b) -> {
a.mergeFrom(b);
return a;
}).map(item -> {
ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(item._2());
}).saveAsTextFile(targetPath, GzipCodec.class);
break;
case "publication":
union.mapToPair((PairFunction<String, String, DLIPublication>) f -> {
final String id = getJPathString(IDJSONPATH, f);
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
return new Tuple2<>(id, mapper.readValue(f, DLIPublication.class));
}).reduceByKey((a, b) -> {
a.mergeFrom(b);
return a;
}).map(item -> {
ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(item._2());
}).saveAsTextFile(targetPath, GzipCodec.class);
break;
case "unknown":
union.mapToPair((PairFunction<String, String, DLIUnknown>) f -> {
final String id = getJPathString(IDJSONPATH, f);
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
return new Tuple2<>(id, mapper.readValue(f, DLIUnknown.class));
}).reduceByKey((a, b) -> {
a.mergeFrom(b);
return a;
}).map(item -> {
ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(item._2());
}).saveAsTextFile(targetPath, GzipCodec.class);
break;
case "relation":
SparkScholexplorerGenerateSimRel.generateDataFrame(spark, sc, inputPath.replace("/relation",""),targetPath.replace("/relation","") );
RDD<Relation> rdd = union.mapToPair((PairFunction<String, String, Relation>) f -> {
final String source = getJPathString(SOURCEJSONPATH, f);
final String target = getJPathString(TARGETJSONPATH, f);
final String reltype = getJPathString(RELJSONPATH, f);
ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
return new Tuple2<>(DHPUtils.md5(String.format("%s::%s::%s", source.toLowerCase(), reltype.toLowerCase(), target.toLowerCase())), mapper.readValue(f, Relation.class));
}).reduceByKey((a, b) -> {
a.mergeFrom(b);
return a;
}).map(Tuple2::_2).rdd();
spark.createDataset(rdd, Encoders.bean(Relation.class)).write().mode(SaveMode.Overwrite).save(targetPath);
Dataset<Relation> rel_ds =spark.read().load(targetPath).as(Encoders.bean(Relation.class));
System.out.println("LOADING PATH :"+targetPath.replace("/relation","")+"/pid_simRel");
Dataset<Relation>sim_ds =spark.read().load(targetPath.replace("/relation","")+"/pid_simRel").as(Encoders.bean(Relation.class));
TargetFunction tf = new TargetFunction();
Dataset<Relation> ids = sim_ds.map(tf, Encoders.bean(Relation.class));
final Dataset<Relation> firstJoin = rel_ds
.joinWith(ids, ids.col("target")
.equalTo(rel_ds.col("source")), "left_outer")
.map((MapFunction<Tuple2<Relation, Relation>, Relation>) s ->
{
if (s._2() != null) {
s._1().setSource(s._2().getSource());
}
return s._1();
}
, Encoders.bean(Relation.class));
Dataset<Relation> secondJoin = firstJoin.joinWith(ids, ids.col("target").equalTo(firstJoin.col("target")),"left_outer")
.map((MapFunction<Tuple2<Relation, Relation>, Relation>) s ->
{
if (s._2() != null) {
s._1().setTarget(s._2().getSource());
}
return s._1();
}
, Encoders.bean(Relation.class));
secondJoin.write().mode(SaveMode.Overwrite).save(targetPath+"_fixed");
FileSystem fileSystem = FileSystem.get(sc.hadoopConfiguration());
fileSystem.delete(new Path(targetPath), true);
fileSystem.rename(new Path(targetPath+"_fixed"),new Path(targetPath));
}
}
public static String getJPathString(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof String)
return (String) o;
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
return (String) ((JSONArray) o).get(0);
return "";
} catch (Exception e) {
return "";
}
}
}

View File

@ -0,0 +1,15 @@
package eu.dnetlib.dhp.graph.scholexplorer;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
public class TargetFunction implements MapFunction<Relation, Relation> {
@Override
public Relation call(Relation relation) throws Exception {
final String type = StringUtils.substringBefore(relation.getSource(), "|");
relation.setTarget(String.format("%s|%s", type, StringUtils.substringAfter(relation.getTarget(),"::")));
return relation;
}
}

View File

@ -0,0 +1,113 @@
package eu.dnetlib.dhp.graph.scholexplorer.parser;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import javax.xml.stream.XMLStreamReader;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public abstract class AbstractScholexplorerParser {
protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class);
final static Pattern pattern = Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE);
private List<String> datasetSubTypes = Arrays.asList("dataset", "software", "film", "sound", "physicalobject", "audiovisual", "collection", "other", "study", "metadata");
public abstract List<Oaf> parseObject(final String record, final RelationMapper relMapper);
protected Map<String, String> getAttributes(final XMLStreamReader parser) {
final Map<String, String> attributesMap = new HashMap<>();
for (int i = 0; i < parser.getAttributeCount(); i++) {
attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
}
return attributesMap;
}
protected List<StructuredProperty> extractSubject(List<VtdUtilityParser.Node> subjects) {
final List<StructuredProperty> subjectResult = new ArrayList<>();
if (subjects != null && subjects.size() > 0) {
subjects.forEach(subjectMap -> {
final StructuredProperty subject = new StructuredProperty();
subject.setValue(subjectMap.getTextValue());
final Qualifier schema = new Qualifier();
schema.setClassid("dnet:subject");
schema.setClassname("dnet:subject");
schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme"));
schema.setSchemename(subjectMap.getAttributes().get("subjectScheme"));
subject.setQualifier(schema);
subjectResult.add(subject);
});
}
return subjectResult;
}
protected StructuredProperty extractIdentifier(List<VtdUtilityParser.Node> identifierType, final String fieldName) {
final StructuredProperty pid = new StructuredProperty();
if (identifierType != null && identifierType.size() > 0) {
final VtdUtilityParser.Node result = identifierType.get(0);
pid.setValue(result.getTextValue());
final Qualifier pidType = new Qualifier();
pidType.setClassname(result.getAttributes().get(fieldName));
pidType.setClassid(result.getAttributes().get(fieldName));
pidType.setSchemename("dnet:pid_types");
pidType.setSchemeid("dnet:pid_types");
pid.setQualifier(pidType);
return pid;
}
return null;
}
protected void inferPid(final StructuredProperty input) {
final Matcher matcher = pattern.matcher(input.getValue());
if (matcher.find()) {
input.setValue(matcher.group());
if (input.getQualifier() == null) {
input.setQualifier(new Qualifier());
input.getQualifier().setSchemename("dnet:pid_types");
input.getQualifier().setSchemeid("dnet:pid_types");
}
input.getQualifier().setClassid("doi");
input.getQualifier().setClassname("doi");
}
}
protected String generateId(final String pid, final String pidType, final String entityType) {
String type;
switch (entityType){
case "publication":
type = "50|";
break;
case "dataset":
type = "60|";
break;
case "unknown":
type = "70|";
break;
default:
throw new IllegalArgumentException("unexpected value "+entityType);
}
if ("dnet".equalsIgnoreCase(pidType))
return type+StringUtils.substringAfter(pid, "::");
return type+ DHPUtils.md5(String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim()));
}
}

View File

@ -0,0 +1,289 @@
package eu.dnetlib.dhp.graph.scholexplorer.parser;
import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
import eu.dnetlib.scholexplorer.relation.RelInfo;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.yarn.webapp.hamlet.Hamlet;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
@Override
public List<Oaf> parseObject(String record, final RelationMapper relationMapper) {
try {
final DLIDataset parsedObject = new DLIDataset();
final VTDGen vg = new VTDGen();
vg.setDoc(record.getBytes());
final List<Oaf> result = new ArrayList<>();
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
DataInfo di = new DataInfo();
di.setTrust("0.9");
di.setDeletedbyinference(false);
di.setInvisible(false);
parsedObject.setDataInfo(di);
parsedObject.setOriginalId(Collections.singletonList(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
parsedObject.setOriginalObjIdentifier(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']"));
parsedObject.setDateofcollection(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"));
final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
if (StringUtils.isNotBlank(resolvedDate)) {
StructuredProperty currentDate = new StructuredProperty();
currentDate.setValue(resolvedDate);
final Qualifier dateQualifier = new Qualifier();
dateQualifier.setClassname("resolvedDate");
dateQualifier.setClassid("resolvedDate");
dateQualifier.setSchemename("dnet::date");
dateQualifier.setSchemeid("dnet::date");
currentDate.setQualifier(dateQualifier);
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
}
final String completionStatus = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']");
final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resource']/*[local-name()='publisher']");
List<VtdUtilityParser.Node> collectedFromNodes =
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='collectedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
List<VtdUtilityParser.Node> resolvededFromNodes =
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resolvedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
Field<String> pf = new Field<>();
pf.setValue(publisher);
parsedObject.setPublisher(pf);
final List<ProvenaceInfo> provenances = new ArrayList<>();
if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
collectedFromNodes.forEach(it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode(provisionMode);
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
resolvededFromNodes.forEach(it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode("resolved");
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
parsedObject.setDlicollectedfrom(provenances);
parsedObject.setCollectedfrom(parsedObject.getDlicollectedfrom().stream().map(
p-> {
final KeyValue cf = new KeyValue();
cf.setKey(p.getId());
cf.setValue(p.getName());
return cf;
}
).collect(Collectors.toList()));
parsedObject.setCompletionStatus(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
final List<Node> identifierType =
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resource']/*[local-name()='identifier']", Collections.singletonList("identifierType"));
StructuredProperty currentPid = extractIdentifier(identifierType, "identifierType");
if (currentPid == null) return null;
inferPid(currentPid);
parsedObject.setPid(Collections.singletonList(currentPid));
final String sourceId = generateId(currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset");
parsedObject.setId(sourceId);
List<String> descs = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']");
if (descs != null && descs.size() > 0)
parsedObject.setDescription(descs.stream()
.map(it -> it.length() < 512 ? it : it.substring(0, 512))
.map(it -> {
final Field<String> d = new Field<>();
d.setValue(it);
return d;
})
.collect(Collectors.toList()));
final List<Node> relatedIdentifiers =
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='relatedIdentifier']",
Arrays.asList("relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
if(relatedIdentifiers!= null) {
result.addAll(relatedIdentifiers.stream()
.flatMap(n -> {
final List<Relation> rels = new ArrayList<>();
Relation r = new Relation();
r.setSource(parsedObject.getId());
final String relatedPid = n.getTextValue();
final String relatedPidType = n.getAttributes().get("relatedIdentifierType");
final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown");
String relationSemantic = n.getAttributes().get("relationType");
String inverseRelation = n.getAttributes().get("inverseRelationType");
final String targetId = generateId(relatedPid, relatedPidType, relatedType);
if (relationMapper.containsKey(relationSemantic.toLowerCase()))
{
RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase());
relationSemantic = relInfo.getOriginal();
inverseRelation = relInfo.getInverse();
}
else {
relationSemantic = "Unknown";
inverseRelation = "Unknown";
}
r.setTarget(targetId);
r.setRelType(relationSemantic);
r.setRelClass("datacite");
r.setCollectedFrom(parsedObject.getCollectedfrom());
r.setDataInfo(di);
rels.add(r);
r = new Relation();
r.setDataInfo(di);
r.setSource(targetId);
r.setTarget(parsedObject.getId());
r.setRelType(inverseRelation);
r.setRelClass("datacite");
r.setCollectedFrom(parsedObject.getCollectedfrom());
rels.add(r);
if("unknown".equalsIgnoreCase(relatedType))
result.add(createUnknownObject(relatedPid, relatedPidType, parsedObject.getCollectedfrom().get(0), di));
return rels.stream();
}).collect(Collectors.toList()));
}
final List<Node> hostedBy =
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
if (hostedBy != null) {
parsedObject.setInstance(hostedBy.stream().map(it ->
{
final Instance i = new Instance();
i.setUrl(Collections.singletonList(currentPid.getValue()));
KeyValue h = new KeyValue();
i.setHostedby(h);
h.setKey(it.getAttributes().get("id"));
h.setValue(it.getAttributes().get("name"));
return i;
}).collect(Collectors.toList()));
}
List<StructuredProperty> subjects = extractSubject(VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resource']//*[local-name()='subject']", Arrays.asList("subjectScheme")));
parsedObject.setSubject(subjects);
Qualifier q = new Qualifier();
q.setClassname("dataset");
q.setClassid("dataset");
q.setSchemename("dataset");
q.setSchemeid("dataset");
parsedObject.setResulttype(q);
parsedObject.setCompletionStatus(completionStatus);
final List<String> creators = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']");
if (creators != null && creators.size() > 0) {
parsedObject.setAuthor(creators
.stream()
.map(a -> {
final Author author = new Author();
author.setFullname(a);
return author;
}).collect(Collectors.toList())
);
}
final List<String> titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']//*[local-name()='title']");
if (titles != null && titles.size() > 0) {
parsedObject.setTitle(titles.stream()
.map(t -> {
final StructuredProperty st = new StructuredProperty();
st.setValue(t);
return st;
}
).collect(Collectors.toList())
);
}
final List<String> dates = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']");
if (dates != null && dates.size() > 0) {
parsedObject.setRelevantdate(dates.stream().map(
cd -> {
StructuredProperty date = new StructuredProperty();
date.setValue(cd);
final Qualifier dq = new Qualifier();
dq.setClassname("date");
dq.setClassid("date");
dq.setSchemename("dnet::date");
dq.setSchemeid("dnet::date");
date.setQualifier(dq);
return date;
}
).collect(Collectors.toList()));
}
result.add(parsedObject);
return result;
} catch (Throwable e) {
log.error("Error on parsing record " + record, e);
return null;
}
}
private DLIUnknown createUnknownObject(final String pid, final String pidType, final KeyValue cf, final DataInfo di) {
final DLIUnknown uk = new DLIUnknown();
uk.setId(generateId(pid, pidType, "unknown"));
ProvenaceInfo pi = new ProvenaceInfo();
pi.setId(cf.getKey());
pi.setName(cf.getValue());
pi.setCompletionStatus("incomplete");
uk.setDataInfo(di);
uk.setDlicollectedfrom(Collections.singletonList(pi));
final StructuredProperty sourcePid = new StructuredProperty();
sourcePid.setValue(pid);
final Qualifier pt = new Qualifier();
pt.setClassname(pidType);
pt.setClassid(pidType);
pt.setSchemename("dnet:pid_types");
pt.setSchemeid("dnet:pid_types");
sourcePid.setQualifier(pt);
uk.setPid(Collections.singletonList(sourcePid));
return uk;
}
}

View File

@ -0,0 +1,252 @@
package eu.dnetlib.dhp.graph.scholexplorer.parser;
import com.ximpleware.AutoPilot;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
import eu.dnetlib.scholexplorer.relation.RelInfo;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
public class PublicationScholexplorerParser extends AbstractScholexplorerParser {
@Override
public List<Oaf> parseObject(final String record, final RelationMapper relationMapper) {
try {
final List<Oaf> result = new ArrayList<>();
final DLIPublication parsedObject = new DLIPublication();
final VTDGen vg = new VTDGen();
vg.setDoc(record.getBytes());
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
final DataInfo di = new DataInfo();
di.setTrust("0.9");
di.setDeletedbyinference(false);
di.setInvisible(false);
parsedObject.setDateofcollection(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"));
final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
parsedObject.setOriginalId(Collections.singletonList(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
if (StringUtils.isNotBlank(resolvedDate)) {
StructuredProperty currentDate = new StructuredProperty();
currentDate.setValue(resolvedDate);
final Qualifier dateQualifier = new Qualifier();
dateQualifier.setClassname("resolvedDate");
dateQualifier.setClassid("resolvedDate");
dateQualifier.setSchemename("dnet::date");
dateQualifier.setSchemeid("dnet::date");
currentDate.setQualifier(dateQualifier);
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
}
final List<Node> pid = VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='pid']", Arrays.asList("type"));
StructuredProperty currentPid = extractIdentifier(pid, "type");
if (currentPid == null) return null;
inferPid(currentPid);
parsedObject.setPid(Collections.singletonList(currentPid));
final String sourceId = generateId(currentPid.getValue(), currentPid.getQualifier().getClassid(), "publication");
parsedObject.setId(sourceId);
parsedObject.setOriginalObjIdentifier(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']"));
String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
List<Node> collectedFromNodes =
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='collectedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
List<Node> resolvededFromNodes =
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resolvedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']");
Field<String> pf = new Field<>();
pf.setValue(publisher);
parsedObject.setPublisher(pf);
final List<ProvenaceInfo> provenances = new ArrayList<>();
if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
collectedFromNodes.forEach(it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode(provisionMode);
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
resolvededFromNodes.forEach(it -> {
final ProvenaceInfo provenance = new ProvenaceInfo();
provenance.setId(it.getAttributes().get("id"));
provenance.setName(it.getAttributes().get("name"));
provenance.setCollectionMode("resolved");
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
provenances.add(provenance);
});
}
parsedObject.setDlicollectedfrom(provenances);
parsedObject.setCompletionStatus(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
parsedObject.setCollectedfrom(parsedObject.getDlicollectedfrom().stream().map(
p -> {
final KeyValue cf = new KeyValue();
cf.setKey(p.getId());
cf.setValue(p.getName());
return cf;
}
).collect(Collectors.toList()));
final List<Node> relatedIdentifiers =
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='relatedIdentifier']",
Arrays.asList("relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
if (relatedIdentifiers != null) {
result.addAll(relatedIdentifiers.stream()
.flatMap(n -> {
final List<Relation> rels = new ArrayList<>();
Relation r = new Relation();
r.setSource(parsedObject.getId());
final String relatedPid = n.getTextValue();
final String relatedPidType = n.getAttributes().get("relatedIdentifierType");
final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown");
String relationSemantic = n.getAttributes().get("relationType");
String inverseRelation = "Unknown";
final String targetId = generateId(relatedPid, relatedPidType, relatedType);
if (relationMapper.containsKey(relationSemantic.toLowerCase()))
{
RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase());
relationSemantic = relInfo.getOriginal();
inverseRelation = relInfo.getInverse();
}
else {
relationSemantic = "Unknown";
}
r.setTarget(targetId);
r.setRelType(relationSemantic);
r.setCollectedFrom(parsedObject.getCollectedfrom());
r.setRelClass("datacite");
r.setDataInfo(di);
rels.add(r);
r = new Relation();
r.setDataInfo(di);
r.setSource(targetId);
r.setTarget(parsedObject.getId());
r.setRelType(inverseRelation);
r.setRelClass("datacite");
r.setCollectedFrom(parsedObject.getCollectedfrom());
rels.add(r);
return rels.stream();
}).collect(Collectors.toList()));
}
final List<Node> hostedBy =
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
if (hostedBy != null) {
parsedObject.setInstance(hostedBy.stream().map(it ->
{
final Instance i = new Instance();
i.setUrl(Collections.singletonList(currentPid.getValue()));
KeyValue h = new KeyValue();
i.setHostedby(h);
h.setKey(it.getAttributes().get("id"));
h.setValue(it.getAttributes().get("name"));
return i;
}).collect(Collectors.toList()));
}
final List<String> authorsNode = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']");
if (authorsNode != null)
parsedObject.setAuthor(authorsNode
.stream()
.map(a -> {
final Author author = new Author();
author.setFullname(a);
return author;
}).collect(Collectors.toList())
);
final List<String> titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']");
if (titles != null) {
parsedObject.setTitle(titles.stream()
.map(t -> {
final StructuredProperty st = new StructuredProperty();
st.setValue(t);
return st;
}
).collect(Collectors.toList())
);
}
Field<String> description = new Field<>();
description.setValue(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']"));
if (StringUtils.isNotBlank(description.getValue()) && description.getValue().length() > 512) {
description.setValue(description.getValue().substring(0, 512));
}
parsedObject.setDescription(Collections.singletonList(description));
final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']");
StructuredProperty date = new StructuredProperty();
date.setValue(cd);
final Qualifier dq = new Qualifier();
dq.setClassname("date");
dq.setClassid("date");
dq.setSchemename("dnet::date");
dq.setSchemeid("dnet::date");
date.setQualifier(dq);
parsedObject.setRelevantdate(Collections.singletonList(date));
List<StructuredProperty> subjects = extractSubject(VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme")));
parsedObject.setSubject(subjects);
parsedObject.setDataInfo(di);
parsedObject.setSubject(subjects);
Qualifier q = new Qualifier();
q.setClassname("publication");
q.setClassid("publication");
q.setSchemename("publication");
q.setSchemeid("publication");
parsedObject.setResulttype(q);
result.add(parsedObject);
return result;
} catch (Throwable e) {
log.error("Input record: " + record);
log.error("Error on parsing record ", e);
return null;
}
}
}

View File

@ -0,0 +1,10 @@
<configuration>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,10 @@
<configuration>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,75 @@
<workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>targetPath</name>
<description>the source path</description>
</property>
<property>
<name>targetDir</name>
<description>the name of the path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>entities</name>
<description>the entities to be extracted</description>
</property>
</parameters>
<start to="DeleteTargetPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="DeleteTargetPath">
<fs>
<mkdir path="${targetPath}"/>
<mkdir path="${targetPath}/dataset"/>
<mkdir path="${targetPath}/publication"/>
<mkdir path="${targetPath}/unknown"/>
<mkdir path="${targetPath}/relation"/>
<delete path='${targetPath}/dataset/${targetDir}'/>
<delete path='${targetPath}/publication/${targetDir}'/>
<delete path='${targetPath}/unknown/${targetDir}'/>
<delete path='${targetPath}/relation/${targetDir}'/>
</fs>
<ok to="ExtractDLIEntities"/>
<error to="Kill"/>
</action>
<action name="ExtractDLIEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Extract ${entities}</name>
<class>eu.dnetlib.dhp.graph.scholexplorer.SparkExtractEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--targetDir</arg><arg>${targetDir}</arg>
<arg>--entities</arg><arg>${entities}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,10 @@
<configuration>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,73 @@
<workflow-app name="import Entities from aggretor to HDFS" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>targetPath</name>
<description>the graph Raw base path</description>
</property>
<property>
<name>format</name>
<description>the postgres URL to access to the database</description>
</property>
<property>
<name>layout</name>
<description>the user postgres</description>
</property>
<property>
<name>interpretation</name>
<description>the password postgres</description>
</property>
<property>
<name>dbhost</name>
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
</property>
<property>
<name>dbName</name>
<description>mongo database</description>
</property>
<property>
<name>user</name>
<description>HDFS user</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${targetPath}'/>
<mkdir path='${workingPath}'/>
</fs>
<ok to="ImportEntitiesFromMongo"/>
<error to="Kill"/>
</action>
<action name="ImportEntitiesFromMongo">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.graph.ImportDataFromMongo</main-class>
<arg>-t</arg><arg>${targetPath}</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-u</arg><arg>${user}</arg>
<arg>-h</arg><arg>${dbhost}</arg>
<arg>-p</arg><arg>27017</arg>
<arg>-dn</arg><arg>${dbName}</arg>
<arg>-f</arg><arg>${format}</arg>
<arg>-l</arg><arg>${layout}</arg>
<arg>-i</arg><arg>${interpretation}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,10 @@
<configuration>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,61 @@
<workflow-app name="Infospace Merge Entities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>targetPath</name>
<description>the source path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>entity</name>
<description>the entity to be merged</description>
</property>
</parameters>
<start to="DeleteTargetPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="DeleteTargetPath">
<fs>
<mkdir path="${targetPath}"/>
<delete path='${targetPath}/${entity}'/>
</fs>
<ok to="MergeDLIEntities"/>
<error to="Kill"/>
</action>
<action name="MergeDLIEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Merge ${entity}</name>
<class>eu.dnetlib.dhp.graph.scholexplorer.SparkScholexplorerMergeEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/${entity}</arg>
<arg>--targetPath</arg><arg>${targetPath}/${entity}</arg>
<arg>--entity</arg><arg>${entity}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,5 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the result data", "paramRequired": true}
]

View File

@ -0,0 +1,12 @@
[
{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the name node", "paramRequired": true},
{"paramName":"u", "paramLongName":"user", "paramDescription": "the name node", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the name node", "paramRequired": true},
{"paramName":"h", "paramLongName":"dbhost", "paramDescription": "the mongo host", "paramRequired": true},
{"paramName":"p", "paramLongName":"dbport", "paramDescription": "the mongo port", "paramRequired": true},
{"paramName":"f", "paramLongName":"format", "paramDescription": "the metadata format to import", "paramRequired": true},
{"paramName":"l", "paramLongName":"layout", "paramDescription": "the metadata layout to import", "paramRequired": true},
{"paramName":"i", "paramLongName":"interpretation", "paramDescription": "the metadata interpretation to import", "paramRequired": true},
{"paramName":"dn", "paramLongName":"dbName", "paramDescription": "the database Name", "paramRequired": true}
]

View File

@ -0,0 +1,7 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the result data", "paramRequired": true},
{"paramName":"td", "paramLongName":"targetDir", "paramDescription": "the name of the result data", "paramRequired": true},
{"paramName":"e", "paramLongName":"entities", "paramDescription": "the entity type to be filtered", "paramRequired": true}
]

View File

@ -0,0 +1,6 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the result data", "paramRequired": true},
{"paramName":"e", "paramLongName":"entity", "paramDescription": "the entity type", "paramRequired": true}
]

View File

@ -0,0 +1,6 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
{"paramName":"e", "paramLongName":"entity", "paramDescription": "the entity type", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the result data", "paramRequired": true}
]

View File

@ -1,30 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_jdbc_url</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hive_db_name</name>
<value>openaire</value>
</property>
</configuration>

View File

@ -0,0 +1,158 @@
{
"cites":{
"original":"Cites",
"inverse":"IsCitedBy"
},
"compiles":{
"original":"Compiles",
"inverse":"IsCompiledBy"
},
"continues":{
"original":"Continues",
"inverse":"IsContinuedBy"
},
"derives":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"describes":{
"original":"Describes",
"inverse":"IsDescribedBy"
},
"documents":{
"original":"Documents",
"inverse":"IsDocumentedBy"
},
"hasmetadata":{
"original":"HasMetadata",
"inverse":"IsMetadataOf"
},
"hasassociationwith":{
"original":"HasAssociationWith",
"inverse":"HasAssociationWith"
},
"haspart":{
"original":"HasPart",
"inverse":"IsPartOf"
},
"hasversion":{
"original":"HasVersion",
"inverse":"IsVersionOf"
},
"iscitedby":{
"original":"IsCitedBy",
"inverse":"Cites"
},
"iscompiledby":{
"original":"IsCompiledBy",
"inverse":"Compiles"
},
"iscontinuedby":{
"original":"IsContinuedBy",
"inverse":"Continues"
},
"isderivedfrom":{
"original":"IsDerivedFrom",
"inverse":"IsSourceOf"
},
"isdescribedby":{
"original":"IsDescribedBy",
"inverse":"Describes"
},
"isdocumentedby":{
"original":"IsDocumentedBy",
"inverse":"Documents"
},
"isidenticalto":{
"original":"IsIdenticalTo",
"inverse":"IsIdenticalTo"
},
"ismetadatafor":{
"original":"IsMetadataFor",
"inverse":"IsMetadataOf"
},
"ismetadataof":{
"original":"IsMetadataOf",
"inverse":"IsMetadataFor"
},
"isnewversionof":{
"original":"IsNewVersionOf",
"inverse":"IsPreviousVersionOf"
},
"isobsoletedby":{
"original":"IsObsoletedBy",
"inverse":"Obsoletes"
},
"isoriginalformof":{
"original":"IsOriginalFormOf",
"inverse":"IsVariantFormOf"
},
"ispartof":{
"original":"IsPartOf",
"inverse":"HasPart"
},
"ispreviousversionof":{
"original":"IsPreviousVersionOf",
"inverse":"IsNewVersionOf"
},
"isreferencedby":{
"original":"IsReferencedBy",
"inverse":"References"
},
"isrelatedto":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"isrequiredby":{
"original":"IsRequiredBy",
"inverse":"Requires"
},
"isreviewedby":{
"original":"IsReviewedBy",
"inverse":"Reviews"
},
"issourceof":{
"original":"IsSourceOf",
"inverse":"IsDerivedFrom"
},
"issupplementedby":{
"original":"IsSupplementedBy",
"inverse":"IsSupplementTo"
},
"issupplementto":{
"original":"IsSupplementTo",
"inverse":"IsSupplementedBy"
},
"isvariantformof":{
"original":"IsVariantFormOf",
"inverse":"IsOriginalFormOf"
},
"isversionof":{
"original":"IsVersionOf",
"inverse":"HasVersion"
},
"obsoletes":{
"original":"Obsoletes",
"inverse":"IsObsoletedBy"
},
"references":{
"original":"References",
"inverse":"IsReferencedBy"
},
"requires":{
"original":"Requires",
"inverse":"IsRequiredBy"
},
"related":{
"original":"IsRelatedTo",
"inverse":"IsRelatedTo"
},
"reviews":{
"original":"Reviews",
"inverse":"IsReviewedBy"
},
"unknown":{
"original":"Unknown",
"inverse":"Unknown"
}
}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.graph.scholexplorer;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.SerializationFeature;
import eu.dnetlib.dhp.graph.scholexplorer.parser.DatasetScholexplorerParser;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.scholexplorer.relation.RelationMapper;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import java.util.List;
public class ScholexplorerParserTest {
@Test
public void testDataciteParser() throws Exception {
String xml = IOUtils.toString(this.getClass().getResourceAsStream("dmf.xml"));
DatasetScholexplorerParser p = new DatasetScholexplorerParser();
List<Oaf> oaves = p.parseObject(xml, RelationMapper.load());
ObjectMapper m = new ObjectMapper();
m.enable(SerializationFeature.INDENT_OUTPUT);
oaves.forEach(oaf -> {
try {
System.out.println(m.writeValueAsString(oaf));
System.out.println("----------------------------");
} catch (JsonProcessingException e) {
}
});
}
}

View File

@ -0,0 +1,11 @@
package eu.dnetlib.dhp.graph.scholexplorer;
public class SparkScholexplorerGraphImporterTest {
}

View File

@ -0,0 +1,8 @@
package eu.dnetlib.dhp.graph.scholexplorer;
public class SparkScholexplorerMergeEntitiesJobTest {
}

View File

@ -0,0 +1,66 @@
<?xml version="1.0" encoding="UTF-8"?>
<oai:record xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<oai:header>
<dri:repositoryId>aaadf8b3-01a8-4cc2-9964-63cfb19df3b4_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId>
<dri:recordIdentifier>oai:pangaea.de:doi:10.1594/PANGAEA.821876</dri:recordIdentifier>
<dri:datasourceprefix>r3d100010134</dri:datasourceprefix>
<dri:objIdentifier>r3d100010134::000083be706192d2d839915694ecfd47</dri:objIdentifier>
<dri:resolvedDate>2020-01-08T04:12:12.287</dri:resolvedDate>
<dri:dateOfCollection>2020-01-08T03:24:10.865Z</dri:dateOfCollection>
<oaf:datasourceprefix/>
<identifier>oai:pangaea.de:doi:10.1594/PANGAEA.821876</identifier>
<setSpec>citable</setSpec>
</oai:header>
<metadata>
<resource xmlns="http://datacite.org/schema/kernel-3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd">
<identifier identifierType="doi">10.1594/pangaea.821876</identifier>
<creators> <creator><creatorName>Macke, Andreas</creatorName></creator><creator><creatorName>Kalisch, John</creatorName></creator> </creators>
<titles> <title>Total Sky Imager observations during POLARSTERN cruise ANT-XXVI/4 on 2010-05-14 with links to images</title> </titles>
<publisher>PANGAEA - Data Publisher for Earth &amp; Environmental Science</publisher>
<dates>
<date dateType="Collected">2010-05-14T00:13:47/2010-05-14T23:55:47</date>
</dates>
<subjects>
<subject subjectScheme="Parameter">DATE/TIME</subject>
<subject subjectScheme="Parameter">LATITUDE</subject>
<subject subjectScheme="Parameter">LONGITUDE</subject>
<subject subjectScheme="Parameter">Uniform resource locator/link to image</subject>
<subject subjectScheme="Method">Total Sky Imager</subject>
<subject subjectScheme="Campaign">ANT-XXVI/4</subject>
<subject subjectScheme="Basis">Polarstern</subject>
</subjects>
<resourceType resourceTypeGeneral="dataset">dataset</resourceType>
<relatedIdentifiers>
<relatedIdentifier relatedIdentifierType="dnet" relationType="isPartOf" inverseRelationType="hasPart" entityType="dataset">dli_resolver::cf447a378b0b6603593f8b0e57242695</relatedIdentifier>
<relatedIdentifier relatedIdentifierType="URL" relationType="references" inverseRelationType="isReferencedBy" entityType="unknown">http://hs.pangaea.de/images/airphoto/ps/ps75/2010-05-14/ant-xxvi_4_2010-05-14_tsi-images-links.zip</relatedIdentifier>
<relatedIdentifier relatedIdentifierType="dnet" relationType="references" inverseRelationType="isReferencedBy" entityType="publication">dli_resolver::f0f5975d20991cffd222c6002ddd5821</relatedIdentifier>
</relatedIdentifiers>
</resource>
</metadata>
<oaf:about xmlns:oaf="http://namespace.dnet.eu/oaf">
<oaf:datainfo >
<oaf:completionStatus>complete</oaf:completionStatus>
<oaf:collectedFrom id="dli_________::r3d100010134" name="Pangaea" completionStatus="complete"/>
</oaf:datainfo>
</oaf:about>
</oai:record>

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,76 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.1.6-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-graph-provision-scholexplorer</artifactId>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,29 @@
package eu.dnetlib.dhp.provision
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{coalesce, col, count, lit}
object DatasetJoiner {
def startJoin(spark: SparkSession, relPath:String, targetPath:String) {
val relation = spark.read.load(relPath)
val relatedPublication = relation.where("target like '50%'").groupBy("source").agg(count("target").as("publication")).select(col("source"). alias("p_source"), col("publication"))
val relatedDataset = relation.where("target like '60%'").groupBy("source").agg(count("target").as("dataset")).select(col("source"). alias("d_source"), col("dataset"))
val relatedUnknown = relation.where("target like '70%'").groupBy("source").agg(count("target").as("unknown")).select(col("source"). alias("u_source"), col("unknown"))
val firstJoin = relatedPublication
.join(relatedDataset,col("p_source").equalTo(col("d_source")),"full")
.select(coalesce(col("p_source"), col("d_source")).alias("id"),
col("publication"),
col("dataset"))
.join(relatedUnknown, col("u_source").equalTo(col("id")),"full")
.select(coalesce(col("u_source"), col("id")).alias("source"),
coalesce(col("publication"),lit(0)).alias("relatedPublication"),
coalesce(col("dataset"),lit(0)).alias("relatedDataset"),
coalesce(col("unknown"),lit(0)).alias("relatedUnknown")
)
firstJoin.write.mode("overwrite").save(targetPath)
}
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.dhp.provision;
import eu.dnetlib.dhp.provision.scholix.summary.Typology;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.lang3.StringUtils;
public class ProvisionUtil {
public final static String deletedByInferenceJPATH = "$.dataInfo.deletedbyinference";
public final static String TARGETJSONPATH = "$.target";
public final static String SOURCEJSONPATH = "$.source";
// public static RelatedItemInfo getItemType(final String item, final String idPath) {
// String targetId = DHPUtils.getJPathString(idPath, item);
// switch (StringUtils.substringBefore(targetId, "|")) {
// case "50":
// return new RelatedItemInfo(null,0,1,0);
// case "60":
// return new RelatedItemInfo(null,1,0,0);
// case "70":
// return new RelatedItemInfo(null,0,0,1);
// default:
// throw new RuntimeException("Unknonw target ID");
//
// }
//
// }
public static Boolean isNotDeleted(final String item) {
return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item));
}
public static Typology getItemTypeFromId(String id) {
switch (StringUtils.substringBefore(id, "|")) {
case "50":
return Typology.publication;
case "60":
return Typology.dataset;
case "70":
return Typology.unknown;
default:
throw new RuntimeException("Unknonw ID type");
}
}
}

View File

@ -0,0 +1,60 @@
package eu.dnetlib.dhp.provision;
import java.io.Serializable;
/**
* This class models the information of related items
*/
public class RelatedItemInfo implements Serializable {
private String source;
private long relatedDataset = 0;
private long relatedPublication = 0;
private long relatedUnknown = 0;
public RelatedItemInfo() {
}
public RelatedItemInfo(String source, long relatedDataset, long relatedPublication, long relatedUnknown) {
this.source = source;
this.relatedDataset = relatedDataset;
this.relatedPublication = relatedPublication;
this.relatedUnknown = relatedUnknown;
}
public String getSource() {
return source;
}
public void setSource(String source) {
this.source = source;
}
public long getRelatedDataset() {
return relatedDataset;
}
public void setRelatedDataset(long relatedDataset) {
this.relatedDataset = relatedDataset;
}
public long getRelatedPublication() {
return relatedPublication;
}
public void setRelatedPublication(long relatedPublication) {
this.relatedPublication = relatedPublication;
}
public long getRelatedUnknown() {
return relatedUnknown;
}
public void setRelatedUnknown(int relatedUnknown) {
this.relatedUnknown = relatedUnknown;
}
}

View File

@ -0,0 +1,84 @@
package eu.dnetlib.dhp.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.catalyst.expressions.Expression;
import scala.Tuple2;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
/**
* SparkExtractRelationCount is a spark job that takes in input relation RDD
* and retrieve for each item in relation which are the number of
* - Related Dataset
* - Related Publication
* - Related Unknown
*/
public class SparkExtractRelationCount {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkExtractRelationCount.class.getResourceAsStream("/eu/dnetlib/dhp/provision/input_related_entities_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkExtractRelationCount.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final String workingDirPath = parser.get("workingDirPath");
final String relationPath = parser.get("relationPath");
DatasetJoiner.startJoin(spark, relationPath,workingDirPath + "/relatedItemCount");
// sc.textFile(relationPath)
// // We start to Filter the relation not deleted by Inference
// .filter(ProvisionUtil::isNotDeleted)
// // Then we create a PairRDD<String, RelatedItem>
// .mapToPair((PairFunction<String, String, RelatedItemInfo>) f
// -> new Tuple2<>(DHPUtils.getJPathString(ProvisionUtil.SOURCEJSONPATH, f), ProvisionUtil.getItemType(f, ProvisionUtil.TARGETJSONPATH)))
// //We reduce and sum the number of Relations
// .reduceByKey((Function2<RelatedItemInfo, RelatedItemInfo, RelatedItemInfo>) (v1, v2) -> {
// if (v1 == null && v2 == null)
// return new RelatedItemInfo();
// return v1 != null ? v1.add(v2) : v2;
// })
// //Set the source Id in RelatedItem object
// .map(k -> k._2().setId(k._1()))
// // Convert to JSON and save as TextFile
// .map(k -> {
// ObjectMapper mapper = new ObjectMapper();
// return mapper.writeValueAsString(k);
// }).saveAsTextFile(workingDirPath + "/relatedItemCount", GzipCodec.class);
}
}

View File

@ -0,0 +1,84 @@
package eu.dnetlib.dhp.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.provision.scholix.*;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
public class SparkGenerateScholix {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateScholix.class.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json")));
parser.parseArgument(args);
SparkConf conf = new SparkConf();
conf.set("spark.sql.shuffle.partitions","4000");
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
final SparkSession spark = SparkSession
.builder()
.config(conf)
.appName(SparkExtractRelationCount.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
conf.registerKryoClasses(new Class[]{
Scholix.class,
ScholixCollectedFrom.class,
ScholixEntityId.class,
ScholixIdentifier.class,
ScholixRelationship.class,
ScholixResource.class
});
final String graphPath = parser.get("graphPath");
final String workingDirPath = parser.get("workingDirPath");
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final Dataset<ScholixSummary> scholixSummary = spark.read().load(workingDirPath + "/summary").as(Encoders.bean(ScholixSummary.class));
final Dataset<Relation> rels = spark.read().load(graphPath + "/relation").as(Encoders.bean(Relation.class));
Dataset<Scholix> firstJoin = scholixSummary.joinWith(rels, scholixSummary.col("id").equalTo(rels.col("source")))
.map((MapFunction<Tuple2<ScholixSummary, Relation>, Scholix>) f -> Scholix.generateScholixWithSource(f._1(), f._2()), Encoders.bean(Scholix.class));
firstJoin.write().mode(SaveMode.Overwrite).save(workingDirPath+"/scholix_1");
Dataset<Scholix> scholix_final = spark.read().load(workingDirPath+"/scholix_1").as(Encoders.bean(Scholix.class));
scholixSummary
.map((MapFunction<ScholixSummary, ScholixResource>) ScholixResource::fromSummary, Encoders.bean(ScholixResource.class))
.repartition(1000)
.write()
.mode(SaveMode.Overwrite)
.save(workingDirPath+"/scholix_target");
Dataset<ScholixResource> target = spark.read().load(workingDirPath+"/scholix_target").as(Encoders.bean(ScholixResource.class));
scholix_final.joinWith(target, scholix_final.col("identifier").equalTo(target.col("dnetIdentifier")), "inner")
.map((MapFunction<Tuple2<Scholix, ScholixResource>, Scholix>) f -> {
final Scholix scholix = f._1();
final ScholixResource scholixTarget = f._2();
scholix.setTarget(scholixTarget);
scholix.generateIdentifier();
scholix.generatelinkPublisher();
return scholix;
}, Encoders.kryo(Scholix.class)).javaRDD().map(s-> {
ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(s);
}).saveAsTextFile(workingDirPath+"/scholix_json", GzipCodec.class);
}
}

View File

@ -0,0 +1,88 @@
package eu.dnetlib.dhp.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
import eu.dnetlib.dhp.utils.DHPUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
public class SparkGenerateSummary {
private static final String jsonIDPath = "$.id";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateSummary.class.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkExtractRelationCount.class.getSimpleName())
.master(parser.get("master"))
.getOrCreate();
final String graphPath = parser.get("graphPath");
final String workingDirPath = parser.get("workingDirPath");
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
Dataset<RelatedItemInfo> rInfo = spark.read().load(workingDirPath + "/relatedItemCount").as(Encoders.bean(RelatedItemInfo.class));
Dataset<ScholixSummary> entity = spark.createDataset(sc.textFile(graphPath + "/publication," + graphPath + "/dataset," + graphPath + "/unknown")
.map(s ->
ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(DHPUtils.getJPathString(jsonIDPath, s)), s)
).rdd(), Encoders.bean(ScholixSummary.class));
Dataset<ScholixSummary> summaryComplete = rInfo.joinWith(entity, rInfo.col("source").equalTo(entity.col("id"))).map((MapFunction<Tuple2<RelatedItemInfo, ScholixSummary>, ScholixSummary>) t ->
{
ScholixSummary scholixSummary = t._2();
RelatedItemInfo relatedItemInfo = t._1();
scholixSummary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
scholixSummary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
scholixSummary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());
return scholixSummary;
}, Encoders.bean(ScholixSummary.class)
);
summaryComplete.write().save(workingDirPath+"/summary");
// JavaPairRDD<String, String> relationCount = sc.textFile(workingDirPath+"/relatedItemCount").mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i));
//
// JavaPairRDD<String, String> entities =
// sc.textFile(graphPath + "/publication")
// .filter(ProvisionUtil::isNotDeleted)
// .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
// .union(
// sc.textFile(graphPath + "/dataset")
// .filter(ProvisionUtil::isNotDeleted)
// .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
// )
// .union(
// sc.textFile(graphPath + "/unknown")
// .filter(ProvisionUtil::isNotDeleted)
// .mapToPair((PairFunction<String, String, String>) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i))
// );
// entities.join(relationCount).map((Function<Tuple2<String, Tuple2<String, String>>, String>) k ->
// ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(k._1()), k._2()._1(), k._2()._2())).saveAsTextFile(workingDirPath+"/summary", GzipCodec.class);
//
//
// ;
}
}

View File

@ -0,0 +1,66 @@
package eu.dnetlib.dhp.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.provision.scholix.Scholix;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
import java.nio.file.attribute.AclFileAttributeView;
import java.util.HashMap;
import java.util.Map;
public class SparkIndexCollectionOnES {
public static void main(String[] args) throws Exception{
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkIndexCollectionOnES.class.getResourceAsStream("/eu/dnetlib/dhp/provision/index_on_es.json")));
parser.parseArgument(args);
SparkConf conf = new SparkConf().setAppName(SparkIndexCollectionOnES.class.getSimpleName())
.setMaster(parser.get("master"));
conf.set("spark.sql.shuffle.partitions","4000");
final String sourcePath = parser.get("sourcePath");
final String index = parser.get("index");
final String idPath = parser.get("idPath");
final String type = parser.get("type");
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<String> inputRdd;
if("summary".equalsIgnoreCase(type))
inputRdd = spark.read().load(sourcePath).as(Encoders.bean(ScholixSummary.class)).map((MapFunction<ScholixSummary, String>) f -> {
final ObjectMapper mapper = new ObjectMapper();
return mapper.writeValueAsString(f);
}, Encoders.STRING()).javaRDD();
else
inputRdd = sc.textFile(sourcePath);
Map<String, String> esCfg = new HashMap<>();
esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
esCfg.put("es.mapping.id", idPath);
esCfg.put("es.batch.write.retry.count", "8");
esCfg.put("es.batch.write.retry.wait", "60s");
esCfg.put("es.batch.size.entries", "200");
esCfg.put("es.nodes.wan.only", "true");
JavaEsSpark.saveJsonToEs(inputRdd,index, esCfg);
}
}

View File

@ -0,0 +1,163 @@
package eu.dnetlib.dhp.provision.scholix;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
public class Scholix implements Serializable {
private String publicationDate;
private List<ScholixEntityId> publisher;
private List<ScholixEntityId> linkprovider;
private ScholixRelationship relationship;
private ScholixResource source;
private ScholixResource target;
private String identifier;
public Scholix clone(final ScholixResource t) {
final Scholix clone = new Scholix();
clone.setPublicationDate(publicationDate);
clone.setPublisher(publisher);
clone.setLinkprovider(linkprovider);
clone.setRelationship(relationship);
clone.setSource(source);
clone.setTarget(t);
clone.generatelinkPublisher();
clone.generateIdentifier();
return clone;
}
public static Scholix generateScholixWithSource(final String sourceSummaryJson, final String relation) {
final ObjectMapper mapper = new ObjectMapper();
try {
ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class);
Relation rel = mapper.readValue(relation, Relation.class);
final Scholix s = new Scholix();
if (scholixSummary.getDate() != null && scholixSummary.getDate().size()>0)
s.setPublicationDate(scholixSummary.getDate().get(0));
s.setLinkprovider(rel.getCollectedFrom().stream().map(cf ->
new ScholixEntityId(cf.getValue(), Collections.singletonList(
new ScholixIdentifier(cf.getKey(), "dnet_identifier")
))).collect(Collectors.toList()));
s.setRelationship(new ScholixRelationship(rel.getRelType(),rel.getRelClass(),null ));
s.setSource(ScholixResource.fromSummary(scholixSummary));
return s;
} catch (Throwable e) {
throw new RuntimeException(String.format("Summary: %s \n relation:%s",sourceSummaryJson, relation), e);
}
}
public static Scholix generateScholixWithSource(final ScholixSummary scholixSummary, final Relation rel) {
final Scholix s = new Scholix();
if (scholixSummary.getDate() != null && scholixSummary.getDate().size()>0)
s.setPublicationDate(scholixSummary.getDate().get(0));
s.setLinkprovider(rel.getCollectedFrom().stream().map(cf ->
new ScholixEntityId(cf.getValue(), Collections.singletonList(
new ScholixIdentifier(cf.getKey(), "dnet_identifier")
))).collect(Collectors.toList()));
s.setRelationship(new ScholixRelationship(rel.getRelType(),rel.getRelClass(),null ));
s.setSource(ScholixResource.fromSummary(scholixSummary));
s.setIdentifier(rel.getTarget());
// ScholixResource mockTarget = new ScholixResource();
// mockTarget.setDnetIdentifier(rel.getTarget());
// s.setTarget(mockTarget);
// s.generateIdentifier();
return s;
}
public void generatelinkPublisher() {
Set<String> publisher = new HashSet<>();
if (source.getPublisher() != null)
publisher.addAll(source.getPublisher().stream().map(ScholixEntityId::getName).collect(Collectors.toList()));
if (target.getPublisher() != null)
publisher.addAll(target.getPublisher().stream().map(ScholixEntityId::getName).collect(Collectors.toList()));
this.publisher = publisher.stream().map(k -> new ScholixEntityId(k ,null)).collect(Collectors.toList());
}
public void generateIdentifier( ) {
setIdentifier(DHPUtils.md5(String.format("%s::%s::%s",source.getDnetIdentifier(),relationship.getName(), target.getDnetIdentifier())));
}
public Scholix addTarget(final String targetSummaryJson) {
final ObjectMapper mapper = new ObjectMapper();
try {
ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class);
setTarget(ScholixResource.fromSummary(targetSummary));
generateIdentifier();
return this;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
public String getPublicationDate() {
return publicationDate;
}
public void setPublicationDate(String publicationDate) {
this.publicationDate = publicationDate;
}
public List<ScholixEntityId> getPublisher() {
return publisher;
}
public void setPublisher(List<ScholixEntityId> publisher) {
this.publisher = publisher;
}
public List<ScholixEntityId> getLinkprovider() {
return linkprovider;
}
public void setLinkprovider(List<ScholixEntityId> linkprovider) {
this.linkprovider = linkprovider;
}
public ScholixRelationship getRelationship() {
return relationship;
}
public void setRelationship(ScholixRelationship relationship) {
this.relationship = relationship;
}
public ScholixResource getSource() {
return source;
}
public void setSource(ScholixResource source) {
this.source = source;
}
public ScholixResource getTarget() {
return target;
}
public void setTarget(ScholixResource target) {
this.target = target;
}
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
}

View File

@ -0,0 +1,43 @@
package eu.dnetlib.dhp.provision.scholix;
import java.io.Serializable;
public class ScholixCollectedFrom implements Serializable {
private ScholixEntityId provider;
private String provisionMode;
private String completionStatus;
public ScholixCollectedFrom() {
}
public ScholixCollectedFrom(ScholixEntityId provider, String provisionMode, String completionStatus) {
this.provider = provider;
this.provisionMode = provisionMode;
this.completionStatus = completionStatus;
}
public ScholixEntityId getProvider() {
return provider;
}
public void setProvider(ScholixEntityId provider) {
this.provider = provider;
}
public String getProvisionMode() {
return provisionMode;
}
public void setProvisionMode(String provisionMode) {
this.provisionMode = provisionMode;
}
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
}

View File

@ -0,0 +1,33 @@
package eu.dnetlib.dhp.provision.scholix;
import java.io.Serializable;
import java.util.List;
public class ScholixEntityId implements Serializable {
private String name;
private List<ScholixIdentifier> identifiers;
public ScholixEntityId() {
}
public ScholixEntityId(String name, List<ScholixIdentifier> identifiers) {
this.name = name;
this.identifiers = identifiers;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public List<ScholixIdentifier> getIdentifiers() {
return identifiers;
}
public void setIdentifiers(List<ScholixIdentifier> identifiers) {
this.identifiers = identifiers;
}
}

View File

@ -0,0 +1,32 @@
package eu.dnetlib.dhp.provision.scholix;
import java.io.Serializable;
public class ScholixIdentifier implements Serializable {
private String identifier;
private String schema;
public ScholixIdentifier() {
}
public ScholixIdentifier(String identifier, String schema) {
this.identifier = identifier;
this.schema = schema;
}
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
public String getSchema() {
return schema;
}
public void setSchema(String schema) {
this.schema = schema;
}
}

View File

@ -0,0 +1,42 @@
package eu.dnetlib.dhp.provision.scholix;
import java.io.Serializable;
public class ScholixRelationship implements Serializable {
private String name;
private String schema;
private String inverse;
public ScholixRelationship() {
}
public ScholixRelationship(String name, String schema, String inverse) {
this.name = name;
this.schema = schema;
this.inverse = inverse;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getSchema() {
return schema;
}
public void setSchema(String schema) {
this.schema = schema;
}
public String getInverse() {
return inverse;
}
public void setInverse(String inverse) {
this.inverse = inverse;
}
}

View File

@ -0,0 +1,139 @@
package eu.dnetlib.dhp.provision.scholix;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
public class ScholixResource implements Serializable {
private List<ScholixIdentifier> identifier;
private String dnetIdentifier;
private String objectType;
private String objectSubType;
private String title;
private List<ScholixEntityId> creator;
private String publicationDate;
private List<ScholixEntityId> publisher;
private List<ScholixCollectedFrom> collectedFrom;
public static ScholixResource fromSummary(ScholixSummary summary) {
final ScholixResource resource = new ScholixResource();
resource.setDnetIdentifier(summary.getId());
resource.setIdentifier(summary.getLocalIdentifier().stream()
.map(i ->
new ScholixIdentifier(i.getId(), i.getType()))
.collect(Collectors.toList()));
resource.setObjectType(summary.getTypology().toString());
if (summary.getTitle() != null && summary.getTitle().size()>0)
resource.setTitle(summary.getTitle().get(0));
if (summary.getAuthor() != null)
resource.setCreator(summary.getAuthor().stream()
.map(c -> new ScholixEntityId(c, null))
.collect(Collectors.toList())
);
if (summary.getDate() != null && summary.getDate().size()>0)
resource.setPublicationDate(summary.getDate().get(0));
if (summary.getPublisher() != null)
resource.setPublisher(summary.getPublisher().stream()
.map(p -> new ScholixEntityId(p, null))
.collect(Collectors.toList())
);
if (summary.getDatasources() != null)
resource.setCollectedFrom(summary.getDatasources().stream()
.map(d ->
new ScholixCollectedFrom(new ScholixEntityId(d.getDatasourceName(),
Collections.singletonList(new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier"))
), "collected", d.getCompletionStatus()))
.collect(Collectors.toList()));
return resource;
}
public List<ScholixIdentifier> getIdentifier() {
return identifier;
}
public void setIdentifier(List<ScholixIdentifier> identifier) {
this.identifier = identifier;
}
public String getDnetIdentifier() {
return dnetIdentifier;
}
public void setDnetIdentifier(String dnetIdentifier) {
this.dnetIdentifier = dnetIdentifier;
}
public String getObjectType() {
return objectType;
}
public void setObjectType(String objectType) {
this.objectType = objectType;
}
public String getObjectSubType() {
return objectSubType;
}
public void setObjectSubType(String objectSubType) {
this.objectSubType = objectSubType;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public List<ScholixEntityId> getCreator() {
return creator;
}
public void setCreator(List<ScholixEntityId> creator) {
this.creator = creator;
}
public String getPublicationDate() {
return publicationDate;
}
public void setPublicationDate(String publicationDate) {
this.publicationDate = publicationDate;
}
public List<ScholixEntityId> getPublisher() {
return publisher;
}
public void setPublisher(List<ScholixEntityId> publisher) {
this.publisher = publisher;
}
public List<ScholixCollectedFrom> getCollectedFrom() {
return collectedFrom;
}
public void setCollectedFrom(List<ScholixCollectedFrom> collectedFrom) {
this.collectedFrom = collectedFrom;
}
}

View File

@ -0,0 +1,44 @@
package eu.dnetlib.dhp.provision.scholix.summary;
import java.io.Serializable;
public class CollectedFromType implements Serializable {
private String datasourceName;
private String datasourceId;
private String completionStatus;
public CollectedFromType() {
}
public CollectedFromType(String datasourceName, String datasourceId, String completionStatus) {
this.datasourceName = datasourceName;
this.datasourceId = datasourceId;
this.completionStatus = completionStatus;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(String datasourceName) {
this.datasourceName = datasourceName;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(String datasourceId) {
this.datasourceId = datasourceId;
}
public String getCompletionStatus() {
return completionStatus;
}
public void setCompletionStatus(String completionStatus) {
this.completionStatus = completionStatus;
}
}

View File

@ -0,0 +1,33 @@
package eu.dnetlib.dhp.provision.scholix.summary;
import java.io.Serializable;
public class SchemeValue implements Serializable {
private String scheme;
private String value;
public SchemeValue() {
}
public SchemeValue(String scheme, String value) {
this.scheme = scheme;
this.value = value;
}
public String getScheme() {
return scheme;
}
public void setScheme(String scheme) {
this.scheme = scheme;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

View File

@ -0,0 +1,309 @@
package eu.dnetlib.dhp.provision.scholix.summary;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.provision.RelatedItemInfo;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
import java.io.Serializable;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
public class ScholixSummary implements Serializable {
private String id;
private List<TypedIdentifier> localIdentifier;
private Typology typology;
private List<String> title;
private List<String> author;
private List<String> date;
private String description;
private List<SchemeValue> subject;
private List<String> publisher;
private long relatedPublications;
private long relatedDatasets;
private long relatedUnknown;
private List<CollectedFromType> datasources;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<TypedIdentifier> getLocalIdentifier() {
return localIdentifier;
}
public void setLocalIdentifier(List<TypedIdentifier> localIdentifier) {
this.localIdentifier = localIdentifier;
}
public Typology getTypology() {
return typology;
}
public void setTypology(Typology typology) {
this.typology = typology;
}
public List<String> getTitle() {
return title;
}
public void setTitle(List<String> title) {
this.title = title;
}
public List<String> getAuthor() {
return author;
}
public void setAuthor(List<String> author) {
this.author = author;
}
public List<String> getDate() {
return date;
}
public void setDate(List<String> date) {
this.date = date;
}
@JsonProperty("abstract")
public String getDescription() {
return description;
}
@JsonProperty("abstract")
public void setDescription(String description) {
this.description = description;
}
public List<SchemeValue> getSubject() {
return subject;
}
public void setSubject(List<SchemeValue> subject) {
this.subject = subject;
}
public List<String> getPublisher() {
return publisher;
}
public void setPublisher(List<String> publisher) {
this.publisher = publisher;
}
public long getRelatedPublications() {
return relatedPublications;
}
public void setRelatedPublications(long relatedPublications) {
this.relatedPublications = relatedPublications;
}
public long getRelatedDatasets() {
return relatedDatasets;
}
public void setRelatedDatasets(long relatedDatasets) {
this.relatedDatasets = relatedDatasets;
}
public long getRelatedUnknown() {
return relatedUnknown;
}
public void setRelatedUnknown(long relatedUnknown) {
this.relatedUnknown = relatedUnknown;
}
public List<CollectedFromType> getDatasources() {
return datasources;
}
public void setDatasources(List<CollectedFromType> datasources) {
this.datasources = datasources;
}
public static ScholixSummary fromJsonOAF(final Typology oafType, final String oafJson) {
try {
final ObjectMapper mapper = new ObjectMapper();
final RelatedItemInfo relatedItemInfo = new RelatedItemInfo();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
switch (oafType) {
case dataset:
return summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo);
case publication:
return summaryFromPublication(mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo);
case unknown:
return summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo);
}
} catch (Throwable e) {
throw new RuntimeException(e);
}
return null;
}
public static String fromJsonOAF(final Typology oafType, final String oafJson, final String relEntityJson) {
try {
final ObjectMapper mapper = new ObjectMapper();
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
RelatedItemInfo relatedItemInfo = mapper.readValue(relEntityJson, RelatedItemInfo.class);
switch (oafType) {
case dataset:
return mapper.writeValueAsString(summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo));
case publication:
return mapper.writeValueAsString(summaryFromPublication(mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo));
case unknown:
return mapper.writeValueAsString(summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo));
}
} catch (Throwable e) {
throw new RuntimeException(e);
}
return null;
}
private static ScholixSummary summaryFromDataset(final DLIDataset item, final RelatedItemInfo relatedItemInfo) {
ScholixSummary summary = new ScholixSummary();
summary.setId(item.getId());
if (item.getPid() != null)
summary.setLocalIdentifier(item.getPid().stream()
.map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid()))
.collect(Collectors.toList())
);
summary.setTypology(Typology.dataset);
if (item.getTitle() != null)
summary.setTitle(item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
if (item.getAuthor() != null) {
summary.setAuthor(item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList()));
}
if (item.getRelevantdate() != null)
summary.setDate(
item.getRelevantdate().stream()
.filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname()))
.map(StructuredProperty::getValue)
.collect(Collectors.toList())
);
if (item.getDescription() != null && item.getDescription().size() > 0)
summary.setDescription(item.getDescription().get(0).getValue());
if (item.getSubject() != null) {
summary.setSubject(item.getSubject().stream()
.map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue()))
.collect(Collectors.toList())
);
}
if (item.getPublisher()!= null)
summary.setPublisher(Collections.singletonList(item.getPublisher().getValue()));
summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());
if (item.getDlicollectedfrom() != null)
summary.setDatasources(item.getDlicollectedfrom().stream()
.map(
c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())
).collect(Collectors.toList()));
return summary;
}
private static ScholixSummary summaryFromPublication(final DLIPublication item, final RelatedItemInfo relatedItemInfo) {
ScholixSummary summary = new ScholixSummary();
summary.setId(item.getId());
if (item.getPid() != null)
summary.setLocalIdentifier(item.getPid().stream()
.map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid()))
.collect(Collectors.toList())
);
summary.setTypology(Typology.publication);
if (item.getTitle() != null)
summary.setTitle(item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
if (item.getAuthor() != null) {
summary.setAuthor(item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList()));
}
if (item.getRelevantdate() != null)
summary.setDate(
item.getRelevantdate().stream()
.filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname()))
.map(StructuredProperty::getValue)
.collect(Collectors.toList())
);
if (item.getDescription() != null && item.getDescription().size() > 0)
summary.setDescription(item.getDescription().get(0).getValue());
if (item.getSubject() != null) {
summary.setSubject(item.getSubject().stream()
.map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue()))
.collect(Collectors.toList())
);
}
if (item.getPublisher()!= null)
summary.setPublisher(Collections.singletonList(item.getPublisher().getValue()));
summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());
if (item.getDlicollectedfrom() != null)
summary.setDatasources(item.getDlicollectedfrom().stream()
.map(
c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())
).collect(Collectors.toList()));
return summary;
}
private static ScholixSummary summaryFromUnknown(final DLIUnknown item, final RelatedItemInfo relatedItemInfo) {
ScholixSummary summary = new ScholixSummary();
summary.setId(item.getId());
if (item.getPid() != null)
summary.setLocalIdentifier(item.getPid().stream()
.map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid()))
.collect(Collectors.toList())
);
summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset());
summary.setRelatedPublications(relatedItemInfo.getRelatedPublication());
summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown());
summary.setTypology(Typology.unknown);
if (item.getDlicollectedfrom() != null)
summary.setDatasources(item.getDlicollectedfrom().stream()
.map(
c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())
).collect(Collectors.toList()));
return summary;
}
}

View File

@ -0,0 +1,32 @@
package eu.dnetlib.dhp.provision.scholix.summary;
import java.io.Serializable;
public class TypedIdentifier implements Serializable {
private String id;
private String type;
public TypedIdentifier() {
}
public TypedIdentifier(String id, String type) {
this.id = id;
this.type = type;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
}

View File

@ -0,0 +1,9 @@
package eu.dnetlib.dhp.provision.scholix.summary;
import java.io.Serializable;
public enum Typology implements Serializable {
dataset,
publication,
unknown
}

View File

@ -0,0 +1,10 @@
<configuration>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,147 @@
<workflow-app name="Materialize and Index graph to ElasticSearch" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingDirPath</name>
<description>the source path</description>
</property>
<property>
<name>graphPath</name>
<description>the graph path</description>
</property>
<property>
<name>index</name>
<description>index name</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>idScholix</name>
<description>the </description>
</property>
<property>
<name>idSummary</name>
<description>number of cores used by single executor</description>
</property>
</parameters>
<start to="indexSummary"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="DeleteTargetPath">
<fs>
<delete path='${workingDirPath}'/>
<mkdir path='${workingDirPath}'/>
</fs>
<ok to="CalculateRelatedItem"/>
<error to="Kill"/>
</action>
<action name="CalculateRelatedItem">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>calculate for each ID the number of related Dataset, publication and Unknown</name>
<class>eu.dnetlib.dhp.provision.SparkExtractRelationCount</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
<arg>--relationPath</arg><arg>${graphPath}/relation</arg>
</spark>
<ok to="generateSummary"/>
<error to="Kill"/>
</action>
<action name="generateSummary">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>generate Summary</name>
<class>eu.dnetlib.dhp.provision.SparkGenerateSummary</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
<arg>--graphPath</arg><arg>${graphPath}</arg>
</spark>
<ok to="generateScholix"/>
<error to="Kill"/>
</action>
<action name="generateScholix">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>generate Scholix</name>
<class>eu.dnetlib.dhp.provision.SparkGenerateScholix</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>--executor-memory 6G --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingDirPath</arg><arg>${workingDirPath}</arg>
<arg>--graphPath</arg><arg>${graphPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="indexSummary">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>index Summary</name>
<class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="64" </spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingDirPath}/summary</arg>
<arg>--index</arg><arg>${index}_object</arg>
<arg>--idPath</arg><arg>id</arg>
<arg>--type</arg><arg>summary</arg>
</spark>
<ok to="indexScholix"/>
<error to="Kill"/>
</action>
<action name="indexScholix">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>index scholix</name>
<class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${workingDirPath}/scholix_json</arg>
<arg>--index</arg><arg>${index}_scholix</arg>
<arg>--idPath</arg><arg>identifier</arg>
<arg>--type</arg><arg>scholix</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,33 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the working path where generated files",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "index",
"paramDescription": "the index name",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "type",
"paramDescription": "should be scholix or summary",
"paramRequired": true
},
{
"paramName": "id",
"paramLongName": "idPath",
"paramDescription": "the identifier field name",
"paramRequired": true
}
]

View File

@ -0,0 +1,20 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDirPath",
"paramDescription": "the working path where generated files",
"paramRequired": true
},
{
"paramName": "g",
"paramLongName": "graphPath",
"paramDescription": "the relationPath path ",
"paramRequired": true
}
]

View File

@ -0,0 +1,20 @@
[
{
"paramName": "mt",
"paramLongName": "master",
"paramDescription": "should be local or yarn",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDirPath",
"paramDescription": "the working path where generated files",
"paramRequired": true
},
{
"paramName": "r",
"paramLongName": "relationPath",
"paramDescription": "the relationPath path ",
"paramRequired": true
}
]

View File

@ -0,0 +1,331 @@
{
"mappings": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"linkprovider": {
"type": "nested",
"properties": {
"identifiers": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"name": {
"type": "keyword"
}
}
},
"publicationDate": {
"type": "keyword"
},
"relationship": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"source": {
"type": "nested",
"properties": {
"collectedFrom": {
"properties": {
"completionStatus": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"provider": {
"properties": {
"identifiers": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"provisionMode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"creator": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"dnetIdentifier": {
"type": "keyword"
},
"identifier": {
"type": "nested",
"properties": {
"identifier": {
"type": "keyword"
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"type": {
"type": "keyword"
}
}
},
"objectType": {
"type": "keyword"
},
"publicationDate": {
"type": "keyword"
},
"publisher": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"target": {
"type": "nested",
"properties": {
"collectedFrom": {
"properties": {
"completionStatus": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"provider": {
"properties": {
"identifiers": {
"properties": {
"identifier": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"provisionMode": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"creator": {
"properties": {
"name": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"dnetIdentifier": {
"type": "keyword"
},
"identifier": {
"type": "nested",
"properties": {
"identifier": {
"type": "keyword"
},
"schema": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"type": {
"type": "keyword"
}
}
},
"objectType": {
"type": "keyword"
},
"publicationDate": {
"type": "keyword"
},
"publisher": {
"type": "nested",
"properties": {
"name": {
"type": "keyword"
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
},
"settings": {
"index": {
"refresh_interval": "600s",
"number_of_shards": "48",
"translog": {
"sync_interval": "15s",
"durability": "ASYNC"
},
"analysis": {
"analyzer": {
"analyzer_keyword": {
"filter": "lowercase",
"tokenizer": "keyword"
}
}
},
"number_of_replicas": "0"
}
}
}

View File

@ -0,0 +1,132 @@
{
"mappings": {
"properties": {
"abstract": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"author": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"datasources": {
"type": "nested",
"properties": {
"completionStatus": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"datasourceId": {
"type": "keyword"
},
"datasourceName": {
"type": "keyword"
}
}
},
"date": {
"type": "keyword"
},
"id": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"localIdentifier": {
"type": "nested",
"properties": {
"id": {
"type": "keyword"
},
"type": {
"type": "keyword"
}
}
},
"publisher": {
"type": "keyword"
},
"relatedDatasets": {
"type": "long"
},
"relatedPublications": {
"type": "long"
},
"relatedUnknown": {
"type": "long"
},
"subject": {
"properties": {
"scheme": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"value": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"typology": {
"type": "keyword"
}
}
},
"settings": {
"index": {
"refresh_interval": "600s",
"number_of_shards": "48",
"translog": {
"sync_interval": "15s",
"durability": "ASYNC"
},
"analysis": {
"analyzer": {
"analyzer_keyword": {
"filter": "lowercase",
"tokenizer": "keyword"
}
}
},
"number_of_replicas": "0"
}
}
}

View File

@ -0,0 +1,28 @@
package eu.dnetlib.dhp.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.provision.scholix.Scholix;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
public class ExtractInfoTest {
@Test
public void testSerialization() throws Exception {
ScholixSummary summary = new ScholixSummary();
summary.setDescription("descrizione");
ObjectMapper mapper = new ObjectMapper();
String json = mapper.writeValueAsString(summary);
System.out.println(json);
System.out.println(mapper.readValue(json, ScholixSummary.class).getDescription());
}
@Test
public void testScholix() throws Exception {
final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json"));
final String jsonRelation = IOUtils.toString(getClass().getResourceAsStream("relation.json"));
Scholix.generateScholixWithSource(jsonSummary, jsonRelation);
}
}

View File

@ -0,0 +1 @@
{"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"relType":"references","subRelType":null,"relClass":"datacite","source":"50|f2123fce7e56c73dc8f1bf64ec59b477","target":"50|b618cbe39ba940a29993ac324e5f9621","collectedFrom":[{"key":"dli_________::datacite","value":"Datasets in Datacite","dataInfo":null}]}

View File

@ -0,0 +1 @@
{"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"relType":"IsReferencedBy","subRelType":null,"relClass":"datacite","source":"50|dedup_______::4f00e4f0e82bb4cbb35261478e55568e","target":"60|97519e00ee2cddfa1f5bcb5220429b8f","collectedFrom":[{"key":"dli_________::europe_pmc__","value":"Europe PMC","dataInfo":null}]}

View File

@ -0,0 +1 @@
{"id":"50|dedup_______::4f00e4f0e82bb4cbb35261478e55568e","localIdentifier":[{"id":"16909284","type":"pbmid"},{"id":"10.1007/s00438-006-0155-3","type":"doi"}],"typology":"publication","title":["Effects of the Sabin-like mutations in domain V of the internal ribosome entry segment on translational efficiency of the Coxsackievirus B3.","Effects of the Sabin-like mutations in domain V of the internal ribosome entry segment on translational efficiency of the Coxsackievirus B3"],"author":["Ben Mhadheb-Gharbi Manel","Gharbi Jawhar","Paulous Sylvie","Brocard Michèle","Komaromva Anastasia","Aouni Mahjoub","M. Kean Katherine"],"date":[null,"2018-11-13","2006-08-14T15:43:22Z"],"subject":[],"publisher":null,"relatedPublications":1,"relatedDatasets":4,"relatedUnknown":0,"datasources":null,"abstract":"The domain V within the internal ribosome entry segment (IRES) of poliovirus (PV) is expected to be important in its own neurovirulence because it contains an attenuating mutation in each of the Sabin vaccine strains. In this study, we try to find out if the results observed in the case of Sabin vaccine strains of PV can be extrapolated to another virus belonging to the same genus of enteroviruses but with a different tropism. To test this hypothesis, we used the coxsackievirus B3 (CVB3), known to be the mo"}

View File

@ -19,6 +19,8 @@
<module>dhp-graph-mapper</module>
<module>dhp-dedup-openaire</module>
<module>dhp-graph-provision</module>
<module>dhp-dedup-scholexplorer</module>
<module>dhp-graph-provision-scholexplorer</module>
</modules>
<pluginRepositories>

16
pom.xml
View File

@ -354,7 +354,20 @@
<version>4.0</version>
</dependency>
<dependency>
<dependency>
<groupId>com.ximpleware</groupId>
<artifactId>vtd-xml</artifactId>
<version>${vtd.version}</version>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
<version>7.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.oozie</groupId>
<artifactId>oozie-client</artifactId>
<version>${dhp.oozie.version}</version>
@ -523,5 +536,6 @@
<junit-jupiter.version>5.6.1</junit-jupiter.version>
<mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version>
</properties>
</project>