forked from D-Net/dnet-hadoop
implemented workfow for aggregation and generation of infospace graph
This commit is contained in:
parent
fa7504bf29
commit
19a80e4638
|
@ -42,6 +42,10 @@
|
|||
<groupId>com.rabbitmq</groupId>
|
||||
<artifactId>amqp-client</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.ximpleware</groupId>
|
||||
<artifactId>vtd-xml</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
package eu.dnetlib.dhp.parser.utility;
|
||||
|
||||
public class VtdException extends Exception {
|
||||
|
||||
public VtdException(final Exception e) {
|
||||
super(e);
|
||||
}
|
||||
|
||||
public VtdException(final Throwable e) {
|
||||
super(e);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
package eu.dnetlib.dhp.parser.utility;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
import com.ximpleware.AutoPilot;
|
||||
import com.ximpleware.VTDNav;
|
||||
|
||||
/**
|
||||
* Created by sandro on 9/29/16.
|
||||
*/
|
||||
public class VtdUtilityParser {
|
||||
|
||||
public static List<Node> getTextValuesWithAttributes(final AutoPilot ap, final VTDNav vn, final String xpath, final List<String> attributes)
|
||||
throws VtdException {
|
||||
final List<Node> results = new ArrayList<>();
|
||||
try {
|
||||
ap.selectXPath(xpath);
|
||||
|
||||
while (ap.evalXPath() != -1) {
|
||||
final Node currentNode = new Node();
|
||||
int t = vn.getText();
|
||||
if (t >= 0) {
|
||||
currentNode.setTextValue(vn.toNormalizedString(t));
|
||||
}
|
||||
currentNode.setAttributes(getAttributes(vn, attributes));
|
||||
results.add(currentNode);
|
||||
}
|
||||
return results;
|
||||
} catch (Exception e) {
|
||||
throw new VtdException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static Map<String, String> getAttributes(final VTDNav vn, final List<String> attributes) {
|
||||
final Map<String, String> currentAttributes = new HashMap<>();
|
||||
if (attributes != null) {
|
||||
|
||||
attributes.forEach(attributeKey -> {
|
||||
try {
|
||||
int attr = vn.getAttrVal(attributeKey);
|
||||
if (attr > -1) {
|
||||
currentAttributes.put(attributeKey, vn.toNormalizedString(attr));
|
||||
}
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
return currentAttributes;
|
||||
}
|
||||
|
||||
public static List<String> getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) throws VtdException {
|
||||
List<String> results = new ArrayList<>();
|
||||
try {
|
||||
ap.selectXPath(xpath);
|
||||
while (ap.evalXPath() != -1) {
|
||||
int t = vn.getText();
|
||||
if (t > -1) results.add(vn.toNormalizedString(t));
|
||||
}
|
||||
return results;
|
||||
} catch (Exception e) {
|
||||
throw new VtdException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static String getSingleValue(final AutoPilot ap, final VTDNav nav, final String xpath) throws VtdException {
|
||||
try {
|
||||
ap.selectXPath(xpath);
|
||||
while (ap.evalXPath() != -1) {
|
||||
int it = nav.getText();
|
||||
if (it > -1)
|
||||
return nav.toNormalizedString(it);
|
||||
}
|
||||
return null;
|
||||
} catch (Exception e) {
|
||||
throw new VtdException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static class Node {
|
||||
|
||||
private String textValue;
|
||||
|
||||
private Map<String, String> attributes;
|
||||
|
||||
public String getTextValue() {
|
||||
return textValue;
|
||||
}
|
||||
|
||||
public void setTextValue(final String textValue) {
|
||||
this.textValue = textValue;
|
||||
}
|
||||
|
||||
public Map<String, String> getAttributes() {
|
||||
return attributes;
|
||||
}
|
||||
|
||||
public void setAttributes(final Map<String, String> attributes) {
|
||||
this.attributes = attributes;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -32,6 +32,12 @@
|
|||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JacksonInject;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
@JsonIgnoreProperties({"blank"})
|
||||
public class KeyValue implements Serializable {
|
||||
|
||||
private String key;
|
||||
|
@ -36,10 +39,12 @@ public class KeyValue implements Serializable {
|
|||
this.dataInfo = dataInfo;
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public String toComparableString() {
|
||||
return isBlank()?"":String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : "");
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isBlank() {
|
||||
return StringUtils.isBlank(key) && StringUtils.isBlank(value);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
package eu.dnetlib.dhp.schema.scholexplorer;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class DLIDataset extends Dataset {
|
||||
|
||||
private List<ProvenaceInfo> dlicollectedfrom;
|
||||
|
||||
private String completionStatus;
|
||||
|
||||
public String getCompletionStatus() {
|
||||
return completionStatus;
|
||||
}
|
||||
|
||||
public void setCompletionStatus(String completionStatus) {
|
||||
this.completionStatus = completionStatus;
|
||||
}
|
||||
|
||||
public List<ProvenaceInfo> getDlicollectedfrom() {
|
||||
return dlicollectedfrom;
|
||||
}
|
||||
|
||||
public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
|
||||
this.dlicollectedfrom = dlicollectedfrom;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void mergeFrom(OafEntity e) {
|
||||
super.mergeFrom(e);
|
||||
DLIDataset p = (DLIDataset) e;
|
||||
if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus))
|
||||
completionStatus = p.completionStatus;
|
||||
if ("complete".equalsIgnoreCase(p.completionStatus))
|
||||
completionStatus = "complete";
|
||||
dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
|
||||
}
|
||||
|
||||
private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
|
||||
Map<String, ProvenaceInfo> result = new HashMap<>();
|
||||
if (a != null)
|
||||
a.forEach(p -> {
|
||||
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
|
||||
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
|
||||
result.put(p.getId(), p);
|
||||
}
|
||||
|
||||
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
|
||||
result.put(p.getId(), p);
|
||||
});
|
||||
if (b != null)
|
||||
b.forEach(p -> {
|
||||
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
|
||||
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
|
||||
result.put(p.getId(), p);
|
||||
}
|
||||
|
||||
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
|
||||
result.put(p.getId(), p);
|
||||
});
|
||||
|
||||
return new ArrayList<>(result.values());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
package eu.dnetlib.dhp.schema.scholexplorer;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
||||
public class DLIPublication extends Publication implements Serializable {
|
||||
private List<ProvenaceInfo> dlicollectedfrom;
|
||||
|
||||
private String completionStatus;
|
||||
|
||||
public String getCompletionStatus() {
|
||||
return completionStatus;
|
||||
}
|
||||
|
||||
public void setCompletionStatus(String completionStatus) {
|
||||
this.completionStatus = completionStatus;
|
||||
}
|
||||
|
||||
public List<ProvenaceInfo> getDlicollectedfrom() {
|
||||
return dlicollectedfrom;
|
||||
}
|
||||
|
||||
public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
|
||||
this.dlicollectedfrom = dlicollectedfrom;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void mergeFrom(OafEntity e) {
|
||||
super.mergeFrom(e);
|
||||
DLIPublication p = (DLIPublication) e;
|
||||
if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus))
|
||||
completionStatus = p.completionStatus;
|
||||
if ("complete".equalsIgnoreCase(p.completionStatus))
|
||||
completionStatus = "complete";
|
||||
dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
|
||||
}
|
||||
|
||||
private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
|
||||
Map<String, ProvenaceInfo> result = new HashMap<>();
|
||||
if (a != null)
|
||||
a.forEach(p -> {
|
||||
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
|
||||
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
|
||||
result.put(p.getId(), p);
|
||||
}
|
||||
|
||||
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
|
||||
result.put(p.getId(), p);
|
||||
});
|
||||
if (b != null)
|
||||
b.forEach(p -> {
|
||||
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
|
||||
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
|
||||
result.put(p.getId(), p);
|
||||
}
|
||||
|
||||
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
|
||||
result.put(p.getId(), p);
|
||||
});
|
||||
|
||||
return new ArrayList<>(result.values());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,108 @@
|
|||
package eu.dnetlib.dhp.schema.scholexplorer;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class DLIUnknown extends Oaf implements Serializable {
|
||||
|
||||
private String id;
|
||||
|
||||
private List<StructuredProperty> pid;
|
||||
|
||||
private String dateofcollection;
|
||||
|
||||
private String dateoftransformation;
|
||||
|
||||
private List<ProvenaceInfo> dlicollectedfrom;
|
||||
|
||||
private String completionStatus = "incomplete";
|
||||
|
||||
public String getCompletionStatus() {
|
||||
return completionStatus;
|
||||
}
|
||||
|
||||
public void setCompletionStatus(String completionStatus) {
|
||||
this.completionStatus = completionStatus;
|
||||
}
|
||||
|
||||
public List<ProvenaceInfo> getDlicollectedfrom() {
|
||||
return dlicollectedfrom;
|
||||
}
|
||||
|
||||
public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) {
|
||||
this.dlicollectedfrom = dlicollectedfrom;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
|
||||
public List<StructuredProperty> getPid() {
|
||||
return pid;
|
||||
}
|
||||
|
||||
public void setPid(List<StructuredProperty> pid) {
|
||||
this.pid = pid;
|
||||
}
|
||||
|
||||
public String getDateofcollection() {
|
||||
return dateofcollection;
|
||||
}
|
||||
|
||||
public void setDateofcollection(String dateofcollection) {
|
||||
this.dateofcollection = dateofcollection;
|
||||
}
|
||||
|
||||
public String getDateoftransformation() {
|
||||
return dateoftransformation;
|
||||
}
|
||||
|
||||
public void setDateoftransformation(String dateoftransformation) {
|
||||
this.dateoftransformation = dateoftransformation;
|
||||
}
|
||||
|
||||
public void mergeFrom(DLIUnknown p) {
|
||||
if ("complete".equalsIgnoreCase(p.completionStatus))
|
||||
completionStatus = "complete";
|
||||
dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom());
|
||||
}
|
||||
|
||||
private List<ProvenaceInfo> mergeProvenance(final List<ProvenaceInfo> a, final List<ProvenaceInfo> b) {
|
||||
Map<String, ProvenaceInfo> result = new HashMap<>();
|
||||
if (a != null)
|
||||
a.forEach(p -> {
|
||||
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
|
||||
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
|
||||
result.put(p.getId(), p);
|
||||
}
|
||||
|
||||
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
|
||||
result.put(p.getId(), p);
|
||||
});
|
||||
if (b != null)
|
||||
b.forEach(p -> {
|
||||
if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) {
|
||||
if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) && StringUtils.isNotBlank(p.getCompletionStatus())) {
|
||||
result.put(p.getId(), p);
|
||||
}
|
||||
|
||||
} else if (p != null && p.getId() != null && !result.containsKey(p.getId()))
|
||||
result.put(p.getId(), p);
|
||||
});
|
||||
|
||||
return new ArrayList<>(result.values());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
package eu.dnetlib.dhp.schema.scholexplorer;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class ProvenaceInfo implements Serializable {
|
||||
|
||||
private String id;
|
||||
|
||||
private String name;
|
||||
|
||||
private String completionStatus;
|
||||
|
||||
private String collectionMode ="collected";
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getCompletionStatus() {
|
||||
return completionStatus;
|
||||
}
|
||||
|
||||
public void setCompletionStatus(String completionStatus) {
|
||||
this.completionStatus = completionStatus;
|
||||
}
|
||||
|
||||
public String getCollectionMode() {
|
||||
return collectionMode;
|
||||
}
|
||||
|
||||
public void setCollectionMode(String collectionMode) {
|
||||
this.collectionMode = collectionMode;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,81 @@
|
|||
package eu.dnetlib.dhp.schema.scholexplorer;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.databind.SerializationFeature;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
|
||||
public class DLItest {
|
||||
|
||||
|
||||
@Test
|
||||
public void testMergePublication() throws JsonProcessingException {
|
||||
DLIPublication a1 = new DLIPublication();
|
||||
a1.setPid(Arrays.asList( createSP("123456","pdb","dnet:pid_types")));
|
||||
a1.setTitle(Collections.singletonList(createSP("Un Titolo", "title", "dnetTitle")));
|
||||
a1.setDlicollectedfrom(Arrays.asList(createCollectedFrom("znd","Zenodo","complete")));
|
||||
a1.setCompletionStatus("complete");
|
||||
|
||||
DLIPublication a = new DLIPublication();
|
||||
a.setPid(Arrays.asList(createSP("10.11","doi","dnet:pid_types"), createSP("123456","pdb","dnet:pid_types")));
|
||||
a.setTitle(Collections.singletonList(createSP("A Title", "title", "dnetTitle")));
|
||||
a.setDlicollectedfrom(Arrays.asList(createCollectedFrom("dct","datacite","complete"),createCollectedFrom("dct","datacite","incomplete")));
|
||||
a.setCompletionStatus("incomplete");
|
||||
|
||||
a.mergeFrom(a1);
|
||||
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
System.out.println(mapper.writeValueAsString(a));
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void testDeserialization() throws IOException {
|
||||
|
||||
final String json ="{\"dataInfo\":{\"invisible\":false,\"inferred\":null,\"deletedbyinference\":false,\"trust\":\"0.9\",\"inferenceprovenance\":null,\"provenanceaction\":null},\"lastupdatetimestamp\":null,\"id\":\"60|bd9352547098929a394655ad1a44a479\",\"originalId\":[\"bd9352547098929a394655ad1a44a479\"],\"collectedfrom\":[{\"key\":\"dli_________::datacite\",\"value\":\"Datasets in Datacite\",\"dataInfo\":null,\"blank\":false}],\"pid\":[{\"value\":\"10.7925/DRS1.DUCHAS_5078760\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\",\"blank\":false},\"dataInfo\":null}],\"dateofcollection\":\"2020-01-09T08:29:31.885Z\",\"dateoftransformation\":null,\"extraInfo\":null,\"oaiprovenance\":null,\"author\":[{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Ireland. Department of Arts, Culture, and the Gaeltacht\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"University College Dublin\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"National Folklore Foundation\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null}],\"resulttype\":null,\"language\":null,\"country\":null,\"subject\":[{\"value\":\"Recreation\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Entertainments and recreational activities\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Siamsaíocht agus caitheamh aimsire\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null}],\"title\":[{\"value\":\"Games We Play\",\"qualifier\":null,\"dataInfo\":null}],\"relevantdate\":[{\"value\":\"1938-09-28\",\"qualifier\":{\"classid\":\"date\",\"classname\":\"date\",\"schemeid\":\"dnet::date\",\"schemename\":\"dnet::date\",\"blank\":false},\"dataInfo\":null}],\"description\":[{\"value\":\"Story collected by Breda Mc Donnell, a student at Tenure school (Tinure, Co. Louth) (no informant identified).\",\"dataInfo\":null}],\"dateofacceptance\":null,\"publisher\":{\"value\":\"University College Dublin\",\"dataInfo\":null},\"embargoenddate\":null,\"source\":null,\"fulltext\":null,\"format\":null,\"contributor\":null,\"resourcetype\":null,\"coverage\":null,\"refereed\":null,\"context\":null,\"processingchargeamount\":null,\"processingchargecurrency\":null,\"externalReference\":null,\"instance\":[],\"storagedate\":null,\"device\":null,\"size\":null,\"version\":null,\"lastmetadataupdate\":null,\"metadataversionnumber\":null,\"geolocation\":null,\"dlicollectedfrom\":[{\"id\":\"dli_________::datacite\",\"name\":\"Datasets in Datacite\",\"completionStatus\":\"complete\",\"collectionMode\":\"resolved\"}],\"completionStatus\":\"complete\"}";
|
||||
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
DLIDataset dliDataset = mapper.readValue(json, DLIDataset.class);
|
||||
mapper.enable(SerializationFeature.INDENT_OUTPUT);
|
||||
System.out.println(mapper.writeValueAsString(dliDataset));
|
||||
}
|
||||
|
||||
private ProvenaceInfo createCollectedFrom(final String id, final String name, final String completionStatus) {
|
||||
ProvenaceInfo p = new ProvenaceInfo();
|
||||
p.setId(id);
|
||||
p.setName(name);
|
||||
p.setCompletionStatus(completionStatus);
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
private StructuredProperty createSP(final String value, final String className, final String schemeName) {
|
||||
StructuredProperty p = new StructuredProperty();
|
||||
p.setValue(value);
|
||||
Qualifier schema = new Qualifier();
|
||||
schema.setClassname(className);
|
||||
schema.setClassid(className);
|
||||
schema.setSchemename(schemeName);
|
||||
schema.setSchemeid(schemeName);
|
||||
p.setQualifier(schema);
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -31,10 +31,6 @@
|
|||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.arakelian</groupId>
|
||||
<artifactId>java-jq</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
|
|
|
@ -30,6 +30,10 @@
|
|||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
|
|
@ -0,0 +1,101 @@
|
|||
package eu.dnetlib.dhp.graph.scholexplorer;
|
||||
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.graph.SparkGraphImporterJob;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import net.minidev.json.JSONArray;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
||||
public class SparkExtractEntitiesJob {
|
||||
final static String IDJSONPATH = "$.id";
|
||||
final static String SOURCEJSONPATH = "$.source";
|
||||
final static String TARGETJSONPATH = "$.target";
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkExtractEntitiesJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_extract_entities_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkGraphImporterJob.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String targetPath = parser.get("targetPath");
|
||||
final String tdir =parser.get("targetDir");
|
||||
final JavaRDD<String> inputRDD = sc.textFile(inputPath);
|
||||
|
||||
List<String> entities = Arrays.stream(parser.get("entities").split(",")).map(String::trim).collect(Collectors.toList());
|
||||
if (entities.stream().anyMatch("dataset"::equalsIgnoreCase)) {
|
||||
//Extract Dataset
|
||||
inputRDD.filter(SparkExtractEntitiesJob::isDataset).saveAsTextFile(targetPath + "/dataset/"+tdir, GzipCodec.class);
|
||||
}
|
||||
if (entities.stream().anyMatch("unknown"::equalsIgnoreCase)) {
|
||||
//Extract Unknown
|
||||
inputRDD.filter(SparkExtractEntitiesJob::isUnknown).saveAsTextFile(targetPath + "/unknown/"+tdir, GzipCodec.class);
|
||||
}
|
||||
|
||||
if (entities.stream().anyMatch("relation"::equalsIgnoreCase)) {
|
||||
//Extract Relation
|
||||
inputRDD.filter(SparkExtractEntitiesJob::isRelation).saveAsTextFile(targetPath + "/relation/"+tdir, GzipCodec.class);
|
||||
}
|
||||
if (entities.stream().anyMatch("publication"::equalsIgnoreCase)) {
|
||||
//Extract Relation
|
||||
inputRDD.filter(SparkExtractEntitiesJob::isPublication).saveAsTextFile(targetPath + "/publication/"+tdir, GzipCodec.class);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static boolean isDataset(final String json) {
|
||||
final String id = getJPathString(IDJSONPATH, json);
|
||||
if (StringUtils.isBlank(id)) return false;
|
||||
return id.startsWith("60|");
|
||||
}
|
||||
|
||||
|
||||
public static boolean isPublication(final String json) {
|
||||
final String id = getJPathString(IDJSONPATH, json);
|
||||
if (StringUtils.isBlank(id)) return false;
|
||||
return id.startsWith("50|");
|
||||
}
|
||||
|
||||
public static boolean isUnknown(final String json) {
|
||||
final String id = getJPathString(IDJSONPATH, json);
|
||||
if (StringUtils.isBlank(id)) return false;
|
||||
return id.startsWith("70|");
|
||||
}
|
||||
|
||||
public static boolean isRelation(final String json) {
|
||||
final String source = getJPathString(SOURCEJSONPATH, json);
|
||||
final String target = getJPathString(TARGETJSONPATH, json);
|
||||
return StringUtils.isNotBlank(source) && StringUtils.isNotBlank(target);
|
||||
}
|
||||
|
||||
|
||||
public static String getJPathString(final String jsonPath, final String json) {
|
||||
try {
|
||||
Object o = JsonPath.read(json, jsonPath);
|
||||
if (o instanceof String)
|
||||
return (String) o;
|
||||
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
|
||||
return (String) ((JSONArray) o).get(0);
|
||||
return "";
|
||||
} catch (Exception e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package eu.dnetlib.dhp.graph.scholexplorer;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.graph.SparkGraphImporterJob;
|
||||
import eu.dnetlib.dhp.graph.scholexplorer.parser.DatasetScholexplorerParser;
|
||||
import eu.dnetlib.dhp.graph.scholexplorer.parser.PublicationScholexplorerParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkScholexplorerGraphImporter {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkScholexplorerGraphImporter.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_scholix_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkGraphImporterJob.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
|
||||
sc.sequenceFile(inputPath, IntWritable.class, Text.class).map(Tuple2::_2).map(Text::toString).repartition(500)
|
||||
.flatMap((FlatMapFunction<String, Oaf>) record -> {
|
||||
switch (parser.get("entity")) {
|
||||
case "dataset":
|
||||
final DatasetScholexplorerParser d = new DatasetScholexplorerParser();
|
||||
return d.parseObject(record).iterator();
|
||||
case "publication":
|
||||
final PublicationScholexplorerParser p = new PublicationScholexplorerParser();
|
||||
return p.parseObject(record).iterator();
|
||||
default:
|
||||
throw new IllegalArgumentException("wrong values of entities");
|
||||
}
|
||||
}).map(k -> {
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
return mapper.writeValueAsString(k);
|
||||
}).saveAsTextFile(parser.get("targetPath"), GzipCodec.class);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,138 @@
|
|||
package eu.dnetlib.dhp.graph.scholexplorer;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.graph.SparkGraphImporterJob;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import net.minidev.json.JSONArray;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function2;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class SparkScholexplorerMergeEntitiesJob {
|
||||
|
||||
final static String IDJSONPATH = "$.id";
|
||||
final static String SOURCEJSONPATH = "$.source";
|
||||
final static String TARGETJSONPATH = "$.target";
|
||||
final static String RELJSONPATH = "$.relType";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkScholexplorerMergeEntitiesJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/merge_entities_scholix_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkGraphImporterJob.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String targetPath = parser.get("targetPath");
|
||||
final String entity = parser.get("entity");
|
||||
|
||||
|
||||
FileSystem fs = FileSystem.get(sc.sc().hadoopConfiguration());
|
||||
List<Path> subFolder = Arrays.stream(fs.listStatus(new Path(inputPath))).filter(FileStatus::isDirectory).map(FileStatus::getPath).collect(Collectors.toList());
|
||||
List<JavaRDD<String>> inputRdd = new ArrayList<>();
|
||||
subFolder.forEach(p -> inputRdd.add(sc.textFile(p.toUri().getRawPath())));
|
||||
JavaRDD<String> union = sc.emptyRDD();
|
||||
for (JavaRDD<String> item : inputRdd) {
|
||||
union = union.union(item);
|
||||
}
|
||||
switch (entity) {
|
||||
case "dataset":
|
||||
union.mapToPair((PairFunction<String, String, DLIDataset>) f -> {
|
||||
final String id = getJPathString(IDJSONPATH, f);
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
return new Tuple2<>(id, mapper.readValue(f, DLIDataset.class));
|
||||
}).reduceByKey((a, b) -> {
|
||||
a.mergeFrom(b);
|
||||
return a;
|
||||
}).map(item -> {
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
return mapper.writeValueAsString(item._2());
|
||||
}).saveAsTextFile(targetPath, GzipCodec.class);
|
||||
break;
|
||||
case "publication":
|
||||
union.mapToPair((PairFunction<String, String, DLIPublication>) f -> {
|
||||
final String id = getJPathString(IDJSONPATH, f);
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
return new Tuple2<>(id, mapper.readValue(f, DLIPublication.class));
|
||||
}).reduceByKey((a, b) -> {
|
||||
a.mergeFrom(b);
|
||||
return a;
|
||||
}).map(item -> {
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
return mapper.writeValueAsString(item._2());
|
||||
}).saveAsTextFile(targetPath, GzipCodec.class);
|
||||
break;
|
||||
case "unknown":
|
||||
union.mapToPair((PairFunction<String, String, DLIUnknown>) f -> {
|
||||
final String id = getJPathString(IDJSONPATH, f);
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
return new Tuple2<>(id, mapper.readValue(f, DLIUnknown.class));
|
||||
}).reduceByKey((a, b) -> {
|
||||
a.mergeFrom(b);
|
||||
return a;
|
||||
}).map(item -> {
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
return mapper.writeValueAsString(item._2());
|
||||
}).saveAsTextFile(targetPath, GzipCodec.class);
|
||||
break;
|
||||
case "relation":
|
||||
union.mapToPair((PairFunction<String, String, Relation>) f -> {
|
||||
final String source = getJPathString(SOURCEJSONPATH, f);
|
||||
final String target = getJPathString(TARGETJSONPATH, f);
|
||||
final String reltype = getJPathString(RELJSONPATH, f);
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
return new Tuple2<>(DHPUtils.md5(String.format("%s::%s::%s", source, reltype, target)), mapper.readValue(f, Relation.class));
|
||||
}).reduceByKey((a, b) -> {
|
||||
a.mergeOAFDataInfo(b);
|
||||
return a;
|
||||
}).map(item -> {
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
return mapper.writeValueAsString(item._2());
|
||||
}).saveAsTextFile(targetPath, GzipCodec.class);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
public static String getJPathString(final String jsonPath, final String json) {
|
||||
try {
|
||||
Object o = JsonPath.read(json, jsonPath);
|
||||
if (o instanceof String)
|
||||
return (String) o;
|
||||
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
|
||||
return (String) ((JSONArray) o).get(0);
|
||||
return "";
|
||||
} catch (Exception e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,112 @@
|
|||
package eu.dnetlib.dhp.graph.scholexplorer.parser;
|
||||
|
||||
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import javax.xml.stream.XMLStreamReader;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public abstract class AbstractScholexplorerParser {
|
||||
|
||||
protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class);
|
||||
final static Pattern pattern = Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE);
|
||||
private List<String> datasetSubTypes = Arrays.asList("dataset", "software", "film", "sound", "physicalobject", "audiovisual", "collection", "other", "study", "metadata");
|
||||
|
||||
public abstract List<Oaf> parseObject(final String record);
|
||||
|
||||
protected Map<String, String> getAttributes(final XMLStreamReader parser) {
|
||||
final Map<String, String> attributesMap = new HashMap<>();
|
||||
for (int i = 0; i < parser.getAttributeCount(); i++) {
|
||||
attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
|
||||
}
|
||||
return attributesMap;
|
||||
}
|
||||
|
||||
|
||||
protected List<StructuredProperty> extractSubject(List<VtdUtilityParser.Node> subjects) {
|
||||
final List<StructuredProperty> subjectResult = new ArrayList<>();
|
||||
if (subjects != null && subjects.size() > 0) {
|
||||
subjects.forEach(subjectMap -> {
|
||||
final StructuredProperty subject = new StructuredProperty();
|
||||
subject.setValue(subjectMap.getTextValue());
|
||||
final Qualifier schema = new Qualifier();
|
||||
schema.setClassid("dnet:subject");
|
||||
schema.setClassname("dnet:subject");
|
||||
schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme"));
|
||||
schema.setSchemename(subjectMap.getAttributes().get("subjectScheme"));
|
||||
subject.setQualifier(schema);
|
||||
subjectResult.add(subject);
|
||||
});
|
||||
}
|
||||
return subjectResult;
|
||||
}
|
||||
|
||||
|
||||
protected StructuredProperty extractIdentifier(List<VtdUtilityParser.Node> identifierType, final String fieldName) {
|
||||
final StructuredProperty pid = new StructuredProperty();
|
||||
if (identifierType != null && identifierType.size() > 0) {
|
||||
final VtdUtilityParser.Node result = identifierType.get(0);
|
||||
pid.setValue(result.getTextValue());
|
||||
final Qualifier pidType = new Qualifier();
|
||||
pidType.setClassname(result.getAttributes().get(fieldName));
|
||||
pidType.setClassid(result.getAttributes().get(fieldName));
|
||||
pidType.setSchemename("dnet:pid_types");
|
||||
pidType.setSchemeid("dnet:pid_types");
|
||||
pid.setQualifier(pidType);
|
||||
return pid;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
protected void inferPid(final StructuredProperty input) {
|
||||
final Matcher matcher = pattern.matcher(input.getValue());
|
||||
if (matcher.find()) {
|
||||
input.setValue(matcher.group());
|
||||
if (input.getQualifier() == null) {
|
||||
input.setQualifier(new Qualifier());
|
||||
input.getQualifier().setSchemename("dnet:pid_types");
|
||||
input.getQualifier().setSchemeid("dnet:pid_types");
|
||||
}
|
||||
input.getQualifier().setClassid("doi");
|
||||
input.getQualifier().setClassname("doi");
|
||||
}
|
||||
}
|
||||
|
||||
protected String generateId(final String pid, final String pidType, final String entityType) {
|
||||
String type = "50|";
|
||||
switch (entityType){
|
||||
case "publication":
|
||||
type = "50|";
|
||||
break;
|
||||
case "dataset":
|
||||
type = "60|";
|
||||
break;
|
||||
case "unknown":
|
||||
type = "70|";
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("unexpected value "+entityType);
|
||||
|
||||
}
|
||||
if ("dnet".equalsIgnoreCase(pidType))
|
||||
return type+StringUtils.substringAfter(pid, "::");
|
||||
|
||||
return type+ DHPUtils.md5(String.format("%s::%s", pid, pidType));
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,263 @@
|
|||
package eu.dnetlib.dhp.graph.scholexplorer.parser;
|
||||
|
||||
import com.ximpleware.AutoPilot;
|
||||
import com.ximpleware.VTDGen;
|
||||
import com.ximpleware.VTDNav;
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
|
||||
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class DatasetScholexplorerParser extends AbstractScholexplorerParser {
|
||||
@Override
|
||||
public List<Oaf> parseObject(String record) {
|
||||
try {
|
||||
final DLIDataset parsedObject = new DLIDataset();
|
||||
final VTDGen vg = new VTDGen();
|
||||
vg.setDoc(record.getBytes());
|
||||
final List<Oaf> result = new ArrayList<>();
|
||||
vg.parse(true);
|
||||
|
||||
final VTDNav vn = vg.getNav();
|
||||
final AutoPilot ap = new AutoPilot(vn);
|
||||
|
||||
DataInfo di = new DataInfo();
|
||||
di.setTrust("0.9");
|
||||
di.setDeletedbyinference(false);
|
||||
di.setInvisible(false);
|
||||
parsedObject.setDataInfo(di);
|
||||
|
||||
|
||||
final String objIdentifier = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']");
|
||||
parsedObject.setId("60|" + StringUtils.substringAfter(objIdentifier, "::"));
|
||||
|
||||
parsedObject.setOriginalId(Collections.singletonList(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
|
||||
|
||||
|
||||
parsedObject.setDateofcollection(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"));
|
||||
|
||||
final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
|
||||
|
||||
if (StringUtils.isNotBlank(resolvedDate)) {
|
||||
StructuredProperty currentDate = new StructuredProperty();
|
||||
currentDate.setValue(resolvedDate);
|
||||
final Qualifier dateQualifier = new Qualifier();
|
||||
dateQualifier.setClassname("resolvedDate");
|
||||
dateQualifier.setClassid("resolvedDate");
|
||||
dateQualifier.setSchemename("dnet::date");
|
||||
dateQualifier.setSchemeid("dnet::date");
|
||||
currentDate.setQualifier(dateQualifier);
|
||||
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
|
||||
}
|
||||
|
||||
final String completionStatus = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']");
|
||||
final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
|
||||
|
||||
final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resource']/*[local-name()='publisher']");
|
||||
|
||||
List<VtdUtilityParser.Node> collectedFromNodes =
|
||||
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='collectedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
|
||||
|
||||
List<VtdUtilityParser.Node> resolvededFromNodes =
|
||||
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resolvedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
|
||||
|
||||
Field<String> pf = new Field<>();
|
||||
pf.setValue(publisher);
|
||||
|
||||
parsedObject.setPublisher(pf);
|
||||
final List<ProvenaceInfo> provenances = new ArrayList<>();
|
||||
if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
|
||||
collectedFromNodes.forEach(it -> {
|
||||
final ProvenaceInfo provenance = new ProvenaceInfo();
|
||||
provenance.setId(it.getAttributes().get("id"));
|
||||
provenance.setName(it.getAttributes().get("name"));
|
||||
provenance.setCollectionMode(provisionMode);
|
||||
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
|
||||
provenances.add(provenance);
|
||||
});
|
||||
}
|
||||
|
||||
if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
|
||||
resolvededFromNodes.forEach(it -> {
|
||||
final ProvenaceInfo provenance = new ProvenaceInfo();
|
||||
provenance.setId(it.getAttributes().get("id"));
|
||||
provenance.setName(it.getAttributes().get("name"));
|
||||
provenance.setCollectionMode("resolved");
|
||||
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
|
||||
provenances.add(provenance);
|
||||
});
|
||||
}
|
||||
|
||||
parsedObject.setDlicollectedfrom(provenances);
|
||||
parsedObject.setCollectedfrom(parsedObject.getDlicollectedfrom().stream().map(
|
||||
p-> {
|
||||
final KeyValue cf = new KeyValue();
|
||||
cf.setKey(p.getId());
|
||||
cf.setValue(p.getName());
|
||||
return cf;
|
||||
}
|
||||
).collect(Collectors.toList()));
|
||||
parsedObject.setCompletionStatus(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
|
||||
|
||||
final List<Node> identifierType =
|
||||
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resource']/*[local-name()='identifier']", Collections.singletonList("identifierType"));
|
||||
|
||||
StructuredProperty currentPid = extractIdentifier(identifierType, "type");
|
||||
if (currentPid == null) return null;
|
||||
inferPid(currentPid);
|
||||
parsedObject.setPid(Collections.singletonList(currentPid));
|
||||
|
||||
|
||||
List<String> descs = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']");
|
||||
if (descs != null && descs.size() > 0)
|
||||
parsedObject.setDescription(descs.stream()
|
||||
.map(it -> it.length() < 512 ? it : it.substring(0, 512))
|
||||
.map(it -> {
|
||||
final Field<String> d = new Field<>();
|
||||
d.setValue(it);
|
||||
return d;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
|
||||
final List<Node> relatedIdentifiers =
|
||||
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='relatedIdentifier']",
|
||||
Arrays.asList("relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
|
||||
|
||||
|
||||
if(relatedIdentifiers!= null) {
|
||||
result.addAll(relatedIdentifiers.stream()
|
||||
.flatMap(n -> {
|
||||
final List<Relation> rels = new ArrayList<>();
|
||||
Relation r = new Relation();
|
||||
r.setSource(parsedObject.getId());
|
||||
final String relatedPid = n.getTextValue();
|
||||
final String relatedPidType = n.getAttributes().get("relatedIdentifierType");
|
||||
final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown");
|
||||
final String relationSemantic = n.getAttributes().get("relationType");
|
||||
final String inverseRelation = n.getAttributes().get("inverseRelationType");
|
||||
final String targetId = generateId(relatedPid, relatedPidType, relatedType);
|
||||
r.setTarget(targetId);
|
||||
r.setRelType(relationSemantic);
|
||||
r.setCollectedFrom(parsedObject.getCollectedfrom());
|
||||
rels.add(r);
|
||||
r = new Relation();
|
||||
r.setSource(targetId);
|
||||
r.setTarget(parsedObject.getId());
|
||||
r.setRelType(inverseRelation);
|
||||
r.setCollectedFrom(parsedObject.getCollectedfrom());
|
||||
rels.add(r);
|
||||
result.add(createUnknownObject(relatedPid, relatedPidType, parsedObject.getCollectedfrom().get(0), di));
|
||||
return rels.stream();
|
||||
}).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
|
||||
final List<Node> hostedBy =
|
||||
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
|
||||
|
||||
|
||||
if (hostedBy != null) {
|
||||
parsedObject.setInstance(hostedBy.stream().map(it ->
|
||||
{
|
||||
final Instance i = new Instance();
|
||||
i.setUrl(Collections.singletonList(currentPid.getValue()));
|
||||
KeyValue h = new KeyValue();
|
||||
i.setHostedby(h);
|
||||
h.setKey(it.getAttributes().get("id"));
|
||||
h.setValue(it.getAttributes().get("name"));
|
||||
return i;
|
||||
}).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
|
||||
List<StructuredProperty> subjects = extractSubject(VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resource']//*[local-name()='subject']", Arrays.asList("subjectScheme")));
|
||||
|
||||
parsedObject.setSubject(subjects);
|
||||
|
||||
parsedObject.setCompletionStatus(completionStatus);
|
||||
|
||||
final List<String> creators = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']");
|
||||
if (creators != null && creators.size() > 0) {
|
||||
parsedObject.setAuthor(creators
|
||||
.stream()
|
||||
.map(a -> {
|
||||
final Author author = new Author();
|
||||
author.setFullname(a);
|
||||
return author;
|
||||
}).collect(Collectors.toList())
|
||||
);
|
||||
}
|
||||
final List<String> titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']//*[local-name()='title']");
|
||||
if (titles != null && titles.size() > 0) {
|
||||
parsedObject.setTitle(titles.stream()
|
||||
.map(t -> {
|
||||
final StructuredProperty st = new StructuredProperty();
|
||||
st.setValue(t);
|
||||
return st;
|
||||
}
|
||||
).collect(Collectors.toList())
|
||||
);
|
||||
}
|
||||
|
||||
final List<String> dates = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']");
|
||||
|
||||
|
||||
if (dates != null && dates.size() > 0) {
|
||||
parsedObject.setRelevantdate(dates.stream().map(
|
||||
cd -> {
|
||||
StructuredProperty date = new StructuredProperty();
|
||||
date.setValue(cd);
|
||||
final Qualifier dq = new Qualifier();
|
||||
dq.setClassname("date");
|
||||
dq.setClassid("date");
|
||||
dq.setSchemename("dnet::date");
|
||||
dq.setSchemeid("dnet::date");
|
||||
date.setQualifier(dq);
|
||||
return date;
|
||||
}
|
||||
).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
|
||||
|
||||
result.add(parsedObject);
|
||||
return result;
|
||||
} catch (Throwable e) {
|
||||
log.error("Error on parsing record " + record, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private DLIUnknown createUnknownObject(final String pid, final String pidType, final KeyValue cf, final DataInfo di) {
|
||||
final DLIUnknown uk = new DLIUnknown();
|
||||
uk.setId(generateId(pid, pidType, "unknown"));
|
||||
ProvenaceInfo pi = new ProvenaceInfo();
|
||||
pi.setId(cf.getKey());
|
||||
pi.setName(cf.getValue());
|
||||
pi.setCompletionStatus("incomplete");
|
||||
uk.setDataInfo(di);
|
||||
uk.setDlicollectedfrom(Collections.singletonList(pi));
|
||||
final StructuredProperty sourcePid = new StructuredProperty();
|
||||
sourcePid.setValue(pid);
|
||||
final Qualifier pt = new Qualifier();
|
||||
pt.setClassname(pidType);
|
||||
pt.setClassid(pidType);
|
||||
pt.setSchemename("dnet:pid_types");
|
||||
pt.setSchemeid("dnet:pid_types");
|
||||
sourcePid.setQualifier(pt);
|
||||
uk.setPid(Collections.singletonList(sourcePid));
|
||||
return uk;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,233 @@
|
|||
package eu.dnetlib.dhp.graph.scholexplorer.parser;
|
||||
|
||||
import com.ximpleware.AutoPilot;
|
||||
import com.ximpleware.VTDGen;
|
||||
import com.ximpleware.VTDNav;
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class PublicationScholexplorerParser extends AbstractScholexplorerParser {
|
||||
|
||||
@Override
|
||||
public List<Oaf> parseObject(final String record) {
|
||||
try {
|
||||
final List<Oaf> result = new ArrayList<>();
|
||||
final DLIPublication parsedObject = new DLIPublication();
|
||||
final VTDGen vg = new VTDGen();
|
||||
vg.setDoc(record.getBytes());
|
||||
vg.parse(true);
|
||||
|
||||
|
||||
final VTDNav vn = vg.getNav();
|
||||
final AutoPilot ap = new AutoPilot(vn);
|
||||
|
||||
final DataInfo di = new DataInfo();
|
||||
di.setTrust("0.9");
|
||||
di.setDeletedbyinference(false);
|
||||
di.setInvisible(false);
|
||||
|
||||
final String objIdentifier = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']");
|
||||
parsedObject.setId("50|" + StringUtils.substringAfter(objIdentifier, "::"));
|
||||
|
||||
parsedObject.setDateofcollection(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"));
|
||||
|
||||
final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']");
|
||||
parsedObject.setOriginalId(Collections.singletonList(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']")));
|
||||
|
||||
if (StringUtils.isNotBlank(resolvedDate)) {
|
||||
StructuredProperty currentDate = new StructuredProperty();
|
||||
currentDate.setValue(resolvedDate);
|
||||
final Qualifier dateQualifier = new Qualifier();
|
||||
dateQualifier.setClassname("resolvedDate");
|
||||
dateQualifier.setClassid("resolvedDate");
|
||||
dateQualifier.setSchemename("dnet::date");
|
||||
dateQualifier.setSchemeid("dnet::date");
|
||||
currentDate.setQualifier(dateQualifier);
|
||||
parsedObject.setRelevantdate(Collections.singletonList(currentDate));
|
||||
}
|
||||
|
||||
|
||||
final List<Node> pid = VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='pid']", Arrays.asList("type"));
|
||||
|
||||
StructuredProperty currentPid = extractIdentifier(pid, "type");
|
||||
if (currentPid == null) return null;
|
||||
inferPid(currentPid);
|
||||
parsedObject.setPid(Collections.singletonList(currentPid));
|
||||
|
||||
String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']");
|
||||
|
||||
List<Node> collectedFromNodes =
|
||||
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='collectedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
|
||||
|
||||
List<Node> resolvededFromNodes =
|
||||
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='resolvedFrom']", Arrays.asList("name", "id", "mode", "completionStatus"));
|
||||
|
||||
final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']");
|
||||
Field<String> pf = new Field<>();
|
||||
pf.setValue(publisher);
|
||||
|
||||
parsedObject.setPublisher(pf);
|
||||
final List<ProvenaceInfo> provenances = new ArrayList<>();
|
||||
if (collectedFromNodes != null && collectedFromNodes.size() > 0) {
|
||||
collectedFromNodes.forEach(it -> {
|
||||
final ProvenaceInfo provenance = new ProvenaceInfo();
|
||||
provenance.setId(it.getAttributes().get("id"));
|
||||
provenance.setName(it.getAttributes().get("name"));
|
||||
provenance.setCollectionMode(provisionMode);
|
||||
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
|
||||
provenances.add(provenance);
|
||||
});
|
||||
}
|
||||
|
||||
if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) {
|
||||
resolvededFromNodes.forEach(it -> {
|
||||
final ProvenaceInfo provenance = new ProvenaceInfo();
|
||||
provenance.setId(it.getAttributes().get("id"));
|
||||
provenance.setName(it.getAttributes().get("name"));
|
||||
provenance.setCollectionMode("resolved");
|
||||
provenance.setCompletionStatus(it.getAttributes().get("completionStatus"));
|
||||
provenances.add(provenance);
|
||||
});
|
||||
}
|
||||
|
||||
parsedObject.setDlicollectedfrom(provenances);
|
||||
parsedObject.setCompletionStatus(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"));
|
||||
|
||||
parsedObject.setCollectedfrom(parsedObject.getDlicollectedfrom().stream().map(
|
||||
p -> {
|
||||
final KeyValue cf = new KeyValue();
|
||||
cf.setKey(p.getId());
|
||||
cf.setValue(p.getName());
|
||||
return cf;
|
||||
}
|
||||
).collect(Collectors.toList()));
|
||||
|
||||
final List<Node> relatedIdentifiers =
|
||||
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='relatedIdentifier']",
|
||||
Arrays.asList("relatedIdentifierType", "relationType", "entityType", "inverseRelationType"));
|
||||
|
||||
|
||||
if (relatedIdentifiers != null) {
|
||||
result.addAll(relatedIdentifiers.stream()
|
||||
.flatMap(n -> {
|
||||
final List<Relation> rels = new ArrayList<>();
|
||||
Relation r = new Relation();
|
||||
r.setSource(parsedObject.getId());
|
||||
final String relatedPid = n.getTextValue();
|
||||
final String relatedPidType = n.getAttributes().get("relatedIdentifierType");
|
||||
final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown");
|
||||
final String relationSemantic = n.getAttributes().get("relationType");
|
||||
final String inverseRelation = n.getAttributes().get("inverseRelationType");
|
||||
final String targetId = generateId(relatedPid, relatedPidType, relatedType);
|
||||
r.setTarget(targetId);
|
||||
r.setRelType(relationSemantic);
|
||||
r.setCollectedFrom(parsedObject.getCollectedfrom());
|
||||
r.setRelClass("datacite");
|
||||
r.setDataInfo(di);
|
||||
rels.add(r);
|
||||
r = new Relation();
|
||||
r.setSource(targetId);
|
||||
r.setTarget(parsedObject.getId());
|
||||
r.setRelType(inverseRelation);
|
||||
r.setCollectedFrom(parsedObject.getCollectedfrom());
|
||||
r.setDataInfo(di);
|
||||
r.setRelClass("datacite");
|
||||
rels.add(r);
|
||||
|
||||
return rels.stream();
|
||||
}).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
final List<Node> hostedBy =
|
||||
VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name"));
|
||||
|
||||
|
||||
if (hostedBy != null) {
|
||||
parsedObject.setInstance(hostedBy.stream().map(it ->
|
||||
{
|
||||
final Instance i = new Instance();
|
||||
i.setUrl(Collections.singletonList(currentPid.getValue()));
|
||||
KeyValue h = new KeyValue();
|
||||
i.setHostedby(h);
|
||||
h.setKey(it.getAttributes().get("id"));
|
||||
h.setValue(it.getAttributes().get("name"));
|
||||
return i;
|
||||
}).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
final List<String> authorsNode = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']");
|
||||
if (authorsNode != null)
|
||||
parsedObject.setAuthor(authorsNode
|
||||
.stream()
|
||||
.map(a -> {
|
||||
final Author author = new Author();
|
||||
author.setFullname(a);
|
||||
return author;
|
||||
}).collect(Collectors.toList())
|
||||
);
|
||||
|
||||
final List<String> titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']");
|
||||
if (titles != null) {
|
||||
parsedObject.setTitle(titles.stream()
|
||||
.map(t -> {
|
||||
final StructuredProperty st = new StructuredProperty();
|
||||
st.setValue(t);
|
||||
return st;
|
||||
}
|
||||
).collect(Collectors.toList())
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
Field<String> description = new Field<>();
|
||||
|
||||
description.setValue(VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']"));
|
||||
|
||||
if (StringUtils.isNotBlank(description.getValue()) && description.getValue().length() > 512) {
|
||||
description.setValue(description.getValue().substring(0, 512));
|
||||
}
|
||||
|
||||
parsedObject.setDescription(Collections.singletonList(description));
|
||||
|
||||
|
||||
final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']");
|
||||
|
||||
StructuredProperty date = new StructuredProperty();
|
||||
date.setValue(cd);
|
||||
final Qualifier dq = new Qualifier();
|
||||
dq.setClassname("date");
|
||||
dq.setClassid("date");
|
||||
dq.setSchemename("dnet::date");
|
||||
dq.setSchemeid("dnet::date");
|
||||
date.setQualifier(dq);
|
||||
parsedObject.setRelevantdate(Collections.singletonList(date));
|
||||
|
||||
List<StructuredProperty> subjects = extractSubject(VtdUtilityParser.getTextValuesWithAttributes(ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme")));
|
||||
parsedObject.setSubject(subjects);
|
||||
|
||||
parsedObject.setDataInfo(di);
|
||||
|
||||
|
||||
result.add(parsedObject);
|
||||
return result;
|
||||
|
||||
} catch (Throwable e) {
|
||||
log.error("Input record: " + record);
|
||||
log.error("Error on parsing record ", e);
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,7 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the result data", "paramRequired": true},
|
||||
{"paramName":"td", "paramLongName":"targetDir", "paramDescription": "the name of the result data", "paramRequired": true},
|
||||
{"paramName":"e", "paramLongName":"entities", "paramDescription": "the entity type to be filtered", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,6 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the result data", "paramRequired": true},
|
||||
{"paramName":"e", "paramLongName":"entity", "paramDescription": "the entity type", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,6 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
||||
{"paramName":"e", "paramLongName":"entity", "paramDescription": "the entity type", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the result data", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,10 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,64 @@
|
|||
<workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetDir</name>
|
||||
<description>the name of the path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>entity</name>
|
||||
<description>the entity to be merged</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="MapGraphIntoDataFrame"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="MapGraphIntoDataFrame">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Merge ${entity}</name>
|
||||
<class>eu.dnetlib.dhp.graph.scholexplorer.SparkScholexplorerMergeEntitiesJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--num-executors 100
|
||||
--conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2"
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--entity</arg><arg>${entity}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,10 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,68 @@
|
|||
<workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetDir</name>
|
||||
<description>the name of the path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>entities</name>
|
||||
<description>the entities to be extracted</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="MapGraphIntoDataFrame"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="MapGraphIntoDataFrame">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extract ${entities}</name>
|
||||
<class>eu.dnetlib.dhp.graph.scholexplorer.SparkExtractEntitiesJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--num-executors 100
|
||||
<!-- --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"-->
|
||||
<!-- --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"-->
|
||||
<!-- --conf spark.sql.warehouse.dir="/user/hive/warehouse"-->
|
||||
--conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2"
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--targetDir</arg><arg>${targetDir}</arg>
|
||||
<arg>--entities</arg><arg>${entities}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,10 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,63 @@
|
|||
<workflow-app name="import_infospace_graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>entity</name>
|
||||
<description>the entity type</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="MapGraphIntoDataFrame"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="MapGraphIntoDataFrame">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Import ${entity} and related entities</name>
|
||||
<class>eu.dnetlib.dhp.graph.scholexplorer.SparkScholexplorerGraphImporter</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--num-executors 100
|
||||
<!-- --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"-->
|
||||
<!-- --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"-->
|
||||
<!-- --conf spark.sql.warehouse.dir="/user/hive/warehouse"-->
|
||||
--conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2"
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--entity</arg><arg>${entity}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,19 @@
|
|||
package eu.dnetlib.dhp.graph.scholexplorer;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class SparkScholexplorerGraphImporterTest {
|
||||
|
||||
@Test
|
||||
|
||||
public void testImport() throws Exception {
|
||||
SparkScholexplorerGraphImporter.main(new String[]{
|
||||
"-mt", "local[*]",
|
||||
"-e", "publication",
|
||||
"-s", "file:///data/scholexplorer_dump/pmf.dli.seq",
|
||||
"-t", "file:///data/scholexplorer_dump/pmf_dli_with_rel"}
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
}
|
6
pom.xml
6
pom.xml
|
@ -231,6 +231,11 @@
|
|||
<artifactId>secondstring</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.ximpleware</groupId>
|
||||
<artifactId>vtd-xml</artifactId>
|
||||
<version>${vtd.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.oozie</groupId>
|
||||
|
@ -421,6 +426,7 @@
|
|||
<dhp.jackson.version>2.9.6</dhp.jackson.version>
|
||||
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
|
||||
<scala.version>2.11.12</scala.version>
|
||||
<vtd.version>[2.12,3.0)</vtd.version>
|
||||
</properties>
|
||||
</project>
|
||||
|
||||
|
|
Loading…
Reference in New Issue