Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into orcid-no-doi

This commit is contained in:
Enrico Ottonello 2021-05-20 18:33:18 +02:00
commit 0821d8e97d
29 changed files with 1456 additions and 41 deletions

View File

@ -22,7 +22,7 @@ import org.apache.poi.xssf.usermodel.XSSFWorkbook;
*/ */
public class EXCELParser { public class EXCELParser {
public <R> List<R> parse(InputStream file, String classForName) public <R> List<R> parse(InputStream file, String classForName, String sheetName)
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException, throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException,
InvalidFormatException { InvalidFormatException {
@ -30,7 +30,11 @@ public class EXCELParser {
OPCPackage pkg = OPCPackage.open(file); OPCPackage pkg = OPCPackage.open(file);
XSSFWorkbook wb = new XSSFWorkbook(pkg); XSSFWorkbook wb = new XSSFWorkbook(pkg);
XSSFSheet sheet = wb.getSheet("cordisref-H2020topics"); XSSFSheet sheet = wb.getSheet(sheetName);
// The lookup result, not the requested name, tells whether the sheet exists:
// the original compared sheetName to null, so a missing sheet was never detected here.
if (sheet == null) {
    throw new RuntimeException("Sheet name " + sheetName + " not present in current file");
}
List<R> ret = new ArrayList<>(); List<R> ret = new ArrayList<>();
@ -49,7 +53,7 @@ public class EXCELParser {
headers.add(dataFormatter.formatCellValue(cell)); headers.add(dataFormatter.formatCellValue(cell));
} }
} else { } else {
Class<?> clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic"); Class<?> clazz = Class.forName(classForName);
final Object cc = clazz.newInstance(); final Object cc = clazz.newInstance();
for (int i = 0; i < headers.size(); i++) { for (int i = 0; i < headers.size(); i++) {

View File

@ -42,19 +42,20 @@ public class ReadExcel implements Closeable {
final String hdfsPath = parser.get("hdfsPath"); final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("hdfsNameNode"); final String hdfsNameNode = parser.get("hdfsNameNode");
final String classForName = parser.get("classForName"); final String classForName = parser.get("classForName");
final String sheetName = parser.get("sheetName");
try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) { try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) {
log.info("Getting Excel file..."); log.info("Getting Excel file...");
readExcel.execute(classForName); readExcel.execute(classForName, sheetName);
} }
} }
public void execute(final String classForName) throws Exception { public void execute(final String classForName, final String sheetName) throws Exception {
EXCELParser excelParser = new EXCELParser(); EXCELParser excelParser = new EXCELParser();
excelParser excelParser
.parse(excelFile, classForName) .parse(excelFile, classForName, sheetName)
.stream() .stream()
.forEach(p -> write(p)); .forEach(p -> write(p));

View File

@ -0,0 +1,215 @@
package eu.dnetlib.dhp.actionmanager.ror;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob;
import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType;
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
/**
 * Spark job that reads a JSON array of {@link RorOrganization} records, converts each of them to an
 * {@link Organization} and stores the result as a sequence file of {@link AtomicAction}s.
 *
 * <p>
 * Each output record is a (key, value) pair of {@link Text}: the key is the canonical name of the
 * payload class, the value is the JSON serialization of the {@link AtomicAction}.
 */
public class GenerateRorActionSetJob {

    private static final Logger log = LoggerFactory.getLogger(GenerateRorActionSetJob.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /** Namespace prefix used to build the identifiers of the generated organizations. */
    private static final String ROR_NS_PREFIX = "ror_________";

    private static final List<KeyValue> ROR_COLLECTED_FROM = listKeyValues(
        "10|openaire____::993a7ae7a863813cf95028b50708e222", "ROR");

    private static final DataInfo ROR_DATA_INFO = dataInfo(
        false, "", false, false, ENTITYREGISTRY_PROVENANCE_ACTION, "0.92");

    private static final Qualifier ROR_PID_TYPE = qualifier(
        "ROR", "ROR", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES);

    /**
     * Entry point: parses the arguments described in {@code action_set_parameters.json}, removes the
     * output directory and regenerates the action set.
     *
     * @param args command line arguments ({@code inputPath}, {@code outputPath} and the optional
     *             {@code isSparkSessionManaged})
     * @throws Exception if argument parsing or the Spark job fails
     */
    public static void main(final String[] args) throws Exception {

        // The resource path is absolute, so it is resolved through this job's own class
        // instead of an unrelated job class.
        final String jsonConfiguration = IOUtils
            .toString(
                GenerateRorActionSetJob.class
                    .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

        parser.parseArgument(args);

        // Defaults to TRUE when the argument is not supplied.
        final Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);

        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("inputPath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final SparkConf conf = new SparkConf();

        runWithSparkSession(conf, isSparkSessionManaged, spark -> {
            removeOutputDir(spark, outputPath);
            processRorOrganizations(spark, inputPath, outputPath);
        });
    }

    /** Removes {@code path} using the Hadoop configuration of the given Spark session. */
    private static void removeOutputDir(final SparkSession spark, final String path) {
        HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
    }

    /**
     * Converts every organization found in {@code inputPath} and saves the resulting atomic actions
     * under {@code outputPath} as a sequence file.
     */
    private static void processRorOrganizations(final SparkSession spark,
        final String inputPath,
        final String outputPath) throws Exception {

        readInputPath(spark, inputPath)
            .map(
                (MapFunction<RorOrganization, Organization>) GenerateRorActionSetJob::convertRorOrg,
                Encoders.bean(Organization.class))
            .toJavaRDD()
            .map(o -> new AtomicAction<>(Organization.class, o))
            .mapToPair(
                aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
                    new Text(OBJECT_MAPPER.writeValueAsString(aa))))
            .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
    }

    /**
     * Maps a ROR record to an {@link Organization}.
     *
     * <p>
     * The identifier is {@code 20|ror_________::} followed by the MD5 of the ROR id; the short name is
     * the first acronym (or the name when there are no acronyms); the website is the first link, if any.
     * Fields that have no counterpart in the ROR record are explicitly set to {@code null}.
     *
     * @param r the ROR record to convert
     * @return the converted organization
     */
    protected static Organization convertRorOrg(final RorOrganization r) {

        // A single timestamp is shared by all the date fields of the record.
        final Date now = new Date();

        final Organization o = new Organization();

        o.setId(String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(r.getId())));
        o.setOriginalId(Arrays.asList(String.format("%s::%s", ROR_NS_PREFIX, r.getId())));
        o.setCollectedfrom(ROR_COLLECTED_FROM);
        o.setPid(pids(r));
        o.setDateofcollection(now.toString());
        o.setDateoftransformation(now.toString());
        o.setExtraInfo(new ArrayList<>()); // Values not present in the file
        o.setOaiprovenance(null); // Values not present in the file
        o.setLegalshortname(field(r.getAcronyms().stream().findFirst().orElse(r.getName()), ROR_DATA_INFO));
        o.setLegalname(field(r.getName(), ROR_DATA_INFO));
        o.setAlternativeNames(alternativeNames(r));
        o.setWebsiteurl(field(r.getLinks().stream().findFirst().orElse(null), ROR_DATA_INFO));
        o.setLogourl(null);
        o.setEclegalbody(null);
        o.setEclegalperson(null);
        o.setEcnonprofit(null);
        o.setEcresearchorganization(null);
        o.setEchighereducation(null);
        o.setEcinternationalorganizationeurinterests(null);
        o.setEcinternationalorganization(null);
        o.setEcenterprise(null);
        o.setEcsmevalidated(null);
        o.setEcnutscode(null);
        if (r.getCountry() != null) {
            o
                .setCountry(
                    qualifier(
                        r.getCountry().getCountryCode(), r
                            .getCountry()
                            .getCountryName(),
                        ModelConstants.DNET_COUNTRY_TYPE, ModelConstants.DNET_COUNTRY_TYPE));
        } else {
            o.setCountry(null);
        }
        o.setDataInfo(ROR_DATA_INFO);
        o.setLastupdatetimestamp(now.getTime());

        return o;
    }

    /**
     * Builds the pid list of an organization: the ROR id itself, followed by every value listed under
     * {@code all} for each external id type (the external id type name is used as pid type).
     */
    private static List<StructuredProperty> pids(final RorOrganization r) {
        final List<StructuredProperty> pids = new ArrayList<>();
        pids.add(structuredProperty(r.getId(), ROR_PID_TYPE, ROR_DATA_INFO));

        for (final Map.Entry<String, ExternalIdType> e : r.getExternalIds().entrySet()) {
            final String type = e.getKey();
            final List<String> all = e.getValue().getAll();
            if (all != null) {
                final Qualifier qualifier = qualifier(
                    type, type, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES);
                for (final String pid : all) {
                    pids
                        .add(structuredProperty(pid, qualifier, ROR_DATA_INFO));
                }
            }
        }

        return pids;
    }

    /**
     * Collects aliases, acronyms and labels (in this order), dropping duplicates and blank values.
     */
    private static List<Field<String>> alternativeNames(final RorOrganization r) {
        // LinkedHashSet: de-duplicates while keeping the insertion order.
        final Set<String> names = new LinkedHashSet<>();
        names.addAll(r.getAliases());
        names.addAll(r.getAcronyms());
        r.getLabels().forEach(l -> names.add(l.getLabel()));

        return names
            .stream()
            .filter(StringUtils::isNotBlank)
            .map(s -> field(s, ROR_DATA_INFO))
            .collect(Collectors.toList());
    }

    /**
     * Loads the whole JSON array stored at {@code path} on the driver and exposes it as a Dataset.
     *
     * @param spark the active Spark session
     * @param path  location of the JSON file containing an array of ROR organizations
     * @return a Dataset with one element per organization in the file
     * @throws Exception if the file cannot be opened or parsed
     */
    private static Dataset<RorOrganization> readInputPath(
        final SparkSession spark,
        final String path) throws Exception {

        // FileSystem.get hands out a cached instance that may be shared with the rest of the JVM
        // (Spark included): only the stream is closed here, never the file system itself.
        // The session's Hadoop configuration is used, consistently with removeOutputDir.
        final FileSystem fileSystem = FileSystem.get(spark.sparkContext().hadoopConfiguration());

        try (final InputStream is = fileSystem.open(new Path(path))) {
            final RorOrganization[] arr = OBJECT_MAPPER.readValue(is, RorOrganization[].class);
            return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class));
        }
    }

}

View File

@ -0,0 +1,122 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
 * Serializable bean for one entry of the {@code addresses} array of a ROR organization record.
 * Every field is bound to its JSON property through {@link JsonProperty}.
 */
public class Address implements Serializable {

    private static final long serialVersionUID = 2444635485253443195L;

    @JsonProperty("lat")
    private Float lat;

    @JsonProperty("state_code")
    private String stateCode;

    @JsonProperty("country_geonames_id")
    private Integer countryGeonamesId;

    @JsonProperty("lng")
    private Float lng;

    @JsonProperty("state")
    private String state;

    @JsonProperty("city")
    private String city;

    @JsonProperty("geonames_city")
    private GeonamesCity geonamesCity;

    @JsonProperty("postcode")
    private String postcode;

    @JsonProperty("primary")
    private Boolean primary;

    @JsonProperty("line")
    private String line;

    /** @return the value of the {@code lat} property */
    public Float getLat() {
        return this.lat;
    }

    public void setLat(final Float lat) {
        this.lat = lat;
    }

    /** @return the value of the {@code state_code} property */
    public String getStateCode() {
        return this.stateCode;
    }

    public void setStateCode(final String stateCode) {
        this.stateCode = stateCode;
    }

    /** @return the value of the {@code country_geonames_id} property */
    public Integer getCountryGeonamesId() {
        return this.countryGeonamesId;
    }

    public void setCountryGeonamesId(final Integer countryGeonamesId) {
        this.countryGeonamesId = countryGeonamesId;
    }

    /** @return the value of the {@code lng} property */
    public Float getLng() {
        return this.lng;
    }

    public void setLng(final Float lng) {
        this.lng = lng;
    }

    /** @return the value of the {@code state} property */
    public String getState() {
        return this.state;
    }

    public void setState(final String state) {
        this.state = state;
    }

    /** @return the value of the {@code city} property */
    public String getCity() {
        return this.city;
    }

    public void setCity(final String city) {
        this.city = city;
    }

    /** @return the value of the {@code geonames_city} property */
    public GeonamesCity getGeonamesCity() {
        return this.geonamesCity;
    }

    public void setGeonamesCity(final GeonamesCity geonamesCity) {
        this.geonamesCity = geonamesCity;
    }

    /** @return the value of the {@code postcode} property */
    public String getPostcode() {
        return this.postcode;
    }

    public void setPostcode(final String postcode) {
        this.postcode = postcode;
    }

    /** @return the value of the {@code primary} property */
    public Boolean getPrimary() {
        return this.primary;
    }

    public void setPrimary(final Boolean primary) {
        this.primary = primary;
    }

    /** @return the value of the {@code line} property */
    public String getLine() {
        return this.line;
    }

    public void setLine(final String line) {
        this.line = line;
    }

}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
 * Serializable bean for the {@code country} object of a ROR organization record:
 * a country code and the matching country name.
 */
public class Country implements Serializable {

    private static final long serialVersionUID = 4357848706229493627L;

    @JsonProperty("country_code")
    private String countryCode;

    @JsonProperty("country_name")
    private String countryName;

    /** @return the value of the {@code country_code} property */
    public String getCountryCode() {
        return this.countryCode;
    }

    public void setCountryCode(final String countryCode) {
        this.countryCode = countryCode;
    }

    /** @return the value of the {@code country_name} property */
    public String getCountryName() {
        return this.countryName;
    }

    public void setCountryName(final String countryName) {
        this.countryName = countryName;
    }

}

View File

@ -0,0 +1,42 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import java.util.List;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
/**
 * Values registered for one type of external identifier: the complete list ({@code all}) and the
 * {@code preferred} one. JSON parsing is delegated to {@link ExternalIdTypeDeserializer}.
 */
@JsonDeserialize(using = ExternalIdTypeDeserializer.class)
public class ExternalIdType implements Serializable {

    private static final long serialVersionUID = 2616688352998387611L;

    private List<String> all;

    private String preferred;

    /** Creates an empty instance; both properties are left unset. */
    public ExternalIdType() {
    }

    /**
     * @param all       every identifier of this type
     * @param preferred the preferred identifier
     */
    public ExternalIdType(final List<String> all, final String preferred) {
        this.all = all;
        this.preferred = preferred;
    }

    /** @return every identifier of this type */
    public List<String> getAll() {
        return this.all;
    }

    public void setAll(final List<String> all) {
        this.all = all;
    }

    /** @return the preferred identifier */
    public String getPreferred() {
        return this.preferred;
    }

    public void setPreferred(final String preferred) {
        this.preferred = preferred;
    }

}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.ObjectCodec;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonNode;
/**
 * Jackson deserializer for {@link ExternalIdType}.
 *
 * <p>
 * The {@code all} property is accepted both as an array of values and as a single scalar value; in
 * both cases it is returned as a list. Missing or JSON-null properties are mapped to an empty list
 * ({@code all}) and to {@code null} ({@code preferred}) instead of the literal string {@code "null"}.
 */
public class ExternalIdTypeDeserializer extends JsonDeserializer<ExternalIdType> {

    /**
     * @param p    parser positioned on the object to read
     * @param ctxt deserialization context (not used)
     * @return the parsed {@link ExternalIdType}
     * @throws IOException if the underlying content cannot be read as a JSON tree
     */
    @Override
    public ExternalIdType deserialize(final JsonParser p, final DeserializationContext ctxt)
        throws IOException, JsonProcessingException {
        final ObjectCodec oc = p.getCodec();
        final JsonNode node = oc.readTree(p);

        final JsonNode allNode = node.get("all");
        final JsonNode preferredNode = node.get("preferred");

        // asText() on a JSON null yields the string "null": absent values must be filtered first.
        final String preferred = isAbsent(preferredNode) ? null : preferredNode.asText();

        final List<String> all = new ArrayList<>();

        if (!isAbsent(allNode)) {
            if (allNode.isArray()) {
                allNode.elements().forEachRemaining(x -> {
                    if (!x.isNull()) {
                        all.add(x.asText());
                    }
                });
            } else {
                // a single scalar value is treated as a one-element list
                all.add(allNode.asText());
            }
        }

        return new ExternalIdType(all, preferred);
    }

    /** Tells whether a property is missing from the object or explicitly set to JSON null. */
    private static boolean isAbsent(final JsonNode n) {
        return n == null || n.isNull();
    }

}

View File

@ -0,0 +1,56 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
 * Serializable bean for the {@code geonames_admin1} / {@code geonames_admin2} objects nested in a
 * {@link GeonamesCity}.
 */
public class GeonamesAdmin implements Serializable {

    private static final long serialVersionUID = 7294958526269195673L;

    @JsonProperty("ascii_name")
    private String asciiName;

    @JsonProperty("id")
    private Integer id;

    @JsonProperty("name")
    private String name;

    @JsonProperty("code")
    private String code;

    /** @return the value of the {@code ascii_name} property */
    public String getAsciiName() {
        return this.asciiName;
    }

    public void setAsciiName(final String asciiName) {
        this.asciiName = asciiName;
    }

    /** @return the value of the {@code id} property */
    public Integer getId() {
        return this.id;
    }

    public void setId(final Integer id) {
        this.id = id;
    }

    /** @return the value of the {@code name} property */
    public String getName() {
        return this.name;
    }

    public void setName(final String name) {
        this.name = name;
    }

    /** @return the value of the {@code code} property */
    public String getCode() {
        return this.code;
    }

    public void setCode(final String code) {
        this.code = code;
    }

}

View File

@ -0,0 +1,100 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
 * Serializable bean for the {@code geonames_city} object nested in an {@link Address}: city name and
 * id, two administrative levels, three NUTS levels and the license of the data.
 */
public class GeonamesCity implements Serializable {

    private static final long serialVersionUID = -8389480201526252955L;

    @JsonProperty("geonames_admin1")
    private GeonamesAdmin geonamesAdmin1;

    @JsonProperty("geonames_admin2")
    private GeonamesAdmin geonamesAdmin2;

    @JsonProperty("city")
    private String city;

    @JsonProperty("id")
    private Integer id;

    @JsonProperty("nuts_level1")
    private NameAndCode nutsLevel1;

    @JsonProperty("nuts_level2")
    private NameAndCode nutsLevel2;

    @JsonProperty("nuts_level3")
    private NameAndCode nutsLevel3;

    @JsonProperty("license")
    private License license;

    /** @return the value of the {@code nuts_level2} property */
    public NameAndCode getNutsLevel2() {
        return this.nutsLevel2;
    }

    public void setNutsLevel2(final NameAndCode nutsLevel2) {
        this.nutsLevel2 = nutsLevel2;
    }

    /** @return the value of the {@code geonames_admin2} property */
    public GeonamesAdmin getGeonamesAdmin2() {
        return this.geonamesAdmin2;
    }

    public void setGeonamesAdmin2(final GeonamesAdmin geonamesAdmin2) {
        this.geonamesAdmin2 = geonamesAdmin2;
    }

    /** @return the value of the {@code geonames_admin1} property */
    public GeonamesAdmin getGeonamesAdmin1() {
        return this.geonamesAdmin1;
    }

    public void setGeonamesAdmin1(final GeonamesAdmin geonamesAdmin1) {
        this.geonamesAdmin1 = geonamesAdmin1;
    }

    /** @return the value of the {@code city} property */
    public String getCity() {
        return this.city;
    }

    public void setCity(final String city) {
        this.city = city;
    }

    /** @return the value of the {@code id} property */
    public Integer getId() {
        return this.id;
    }

    public void setId(final Integer id) {
        this.id = id;
    }

    /** @return the value of the {@code nuts_level1} property */
    public NameAndCode getNutsLevel1() {
        return this.nutsLevel1;
    }

    public void setNutsLevel1(final NameAndCode nutsLevel1) {
        this.nutsLevel1 = nutsLevel1;
    }

    /** @return the value of the {@code nuts_level3} property */
    public NameAndCode getNutsLevel3() {
        return this.nutsLevel3;
    }

    public void setNutsLevel3(final NameAndCode nutsLevel3) {
        this.nutsLevel3 = nutsLevel3;
    }

    /** @return the value of the {@code license} property */
    public License getLicense() {
        return this.license;
    }

    public void setLicense(final License license) {
        this.license = license;
    }

}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
 * Serializable bean for one entry of the {@code labels} array of a ROR organization record:
 * a label text and its {@code iso639} value.
 */
public class Label implements Serializable {

    private static final long serialVersionUID = -6576156103297850809L;

    @JsonProperty("iso639")
    private String iso639;

    @JsonProperty("label")
    private String label;

    /** @return the value of the {@code iso639} property */
    public String getIso639() {
        return this.iso639;
    }

    public void setIso639(final String iso639) {
        this.iso639 = iso639;
    }

    /** @return the value of the {@code label} property */
    public String getLabel() {
        return this.label;
    }

    public void setLabel(final String label) {
        this.label = label;
    }

}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
 * Serializable bean for the {@code license} object nested in a {@link GeonamesCity}:
 * an attribution text and a license reference.
 */
public class License implements Serializable {

    private static final long serialVersionUID = -194308261058176439L;

    @JsonProperty("attribution")
    private String attribution;

    @JsonProperty("license")
    private String license;

    /** @return the value of the {@code attribution} property */
    public String getAttribution() {
        return this.attribution;
    }

    public void setAttribution(final String attribution) {
        this.attribution = attribution;
    }

    /** @return the value of the {@code license} property */
    public String getLicense() {
        return this.license;
    }

    public void setLicense(final String license) {
        this.license = license;
    }

}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
 * Serializable (name, code) pair; used by {@link GeonamesCity} for its {@code nuts_level*} properties.
 */
public class NameAndCode implements Serializable {

    private static final long serialVersionUID = 5459836979206140843L;

    @JsonProperty("name")
    private String name;

    @JsonProperty("code")
    private String code;

    /** @return the value of the {@code name} property */
    public String getName() {
        return this.name;
    }

    public void setName(final String name) {
        this.name = name;
    }

    /** @return the value of the {@code code} property */
    public String getCode() {
        return this.code;
    }

    public void setCode(final String code) {
        this.code = code;
    }

}

View File

@ -0,0 +1,45 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
 * Serializable bean for one entry of the {@code relationships} array of a ROR organization record:
 * the relationship type plus the id and label of the related organization.
 */
public class Relationship implements Serializable {

    private static final long serialVersionUID = 7847399503395576960L;

    @JsonProperty("type")
    private String type;

    @JsonProperty("id")
    private String id;

    @JsonProperty("label")
    private String label;

    /** @return the value of the {@code type} property */
    public String getType() {
        return this.type;
    }

    public void setType(final String type) {
        this.type = type;
    }

    /** @return the value of the {@code id} property */
    public String getId() {
        return this.id;
    }

    public void setId(final String id) {
        this.id = id;
    }

    /** @return the value of the {@code label} property */
    public String getLabel() {
        return this.label;
    }

    public void setLabel(final String label) {
        this.label = label;
    }

}

View File

@ -0,0 +1,192 @@
package eu.dnetlib.dhp.actionmanager.ror.model;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
 * Serializable bean for one ROR organization record. Every field is bound to its JSON property
 * through {@link JsonProperty}; collection-valued fields start as empty collections.
 */
public class RorOrganization implements Serializable {

    private static final long serialVersionUID = -2658312087616043225L;

    @JsonProperty("ip_addresses")
    private List<String> ipAddresses = new ArrayList<>();

    @JsonProperty("aliases")
    private List<String> aliases = new ArrayList<>();

    @JsonProperty("acronyms")
    private List<String> acronyms = new ArrayList<>();

    @JsonProperty("links")
    private List<String> links = new ArrayList<>();

    @JsonProperty("country")
    private Country country;

    @JsonProperty("name")
    private String name;

    @JsonProperty("wikipedia_url")
    private String wikipediaUrl;

    @JsonProperty("addresses")
    private List<Address> addresses = new ArrayList<>();

    @JsonProperty("types")
    private List<String> types = new ArrayList<>();

    @JsonProperty("established")
    private Integer established;

    @JsonProperty("relationships")
    private List<Relationship> relationships = new ArrayList<>();

    @JsonProperty("email_address")
    private String emailAddress;

    // keyed by external id type name; LinkedHashMap keeps the insertion order
    @JsonProperty("external_ids")
    private Map<String, ExternalIdType> externalIds = new LinkedHashMap<>();

    @JsonProperty("id")
    private String id;

    @JsonProperty("labels")
    private List<Label> labels = new ArrayList<>();

    @JsonProperty("status")
    private String status;

    /** @return the value of the {@code ip_addresses} property */
    public List<String> getIpAddresses() {
        return this.ipAddresses;
    }

    public void setIpAddresses(final List<String> ipAddresses) {
        this.ipAddresses = ipAddresses;
    }

    /** @return the value of the {@code aliases} property */
    public List<String> getAliases() {
        return this.aliases;
    }

    public void setAliases(final List<String> aliases) {
        this.aliases = aliases;
    }

    /** @return the value of the {@code acronyms} property */
    public List<String> getAcronyms() {
        return this.acronyms;
    }

    public void setAcronyms(final List<String> acronyms) {
        this.acronyms = acronyms;
    }

    /** @return the value of the {@code links} property */
    public List<String> getLinks() {
        return this.links;
    }

    public void setLinks(final List<String> links) {
        this.links = links;
    }

    /** @return the value of the {@code country} property */
    public Country getCountry() {
        return this.country;
    }

    public void setCountry(final Country country) {
        this.country = country;
    }

    /** @return the value of the {@code name} property */
    public String getName() {
        return this.name;
    }

    public void setName(final String name) {
        this.name = name;
    }

    /** @return the value of the {@code wikipedia_url} property */
    public String getWikipediaUrl() {
        return this.wikipediaUrl;
    }

    public void setWikipediaUrl(final String wikipediaUrl) {
        this.wikipediaUrl = wikipediaUrl;
    }

    /** @return the value of the {@code addresses} property */
    public List<Address> getAddresses() {
        return this.addresses;
    }

    public void setAddresses(final List<Address> addresses) {
        this.addresses = addresses;
    }

    /** @return the value of the {@code types} property */
    public List<String> getTypes() {
        return this.types;
    }

    public void setTypes(final List<String> types) {
        this.types = types;
    }

    /** @return the value of the {@code established} property */
    public Integer getEstablished() {
        return this.established;
    }

    public void setEstablished(final Integer established) {
        this.established = established;
    }

    /** @return the value of the {@code relationships} property */
    public List<Relationship> getRelationships() {
        return this.relationships;
    }

    public void setRelationships(final List<Relationship> relationships) {
        this.relationships = relationships;
    }

    /** @return the value of the {@code email_address} property */
    public String getEmailAddress() {
        return this.emailAddress;
    }

    public void setEmailAddress(final String emailAddress) {
        this.emailAddress = emailAddress;
    }

    /** @return the value of the {@code external_ids} property */
    public Map<String, ExternalIdType> getExternalIds() {
        return this.externalIds;
    }

    public void setExternalIds(final Map<String, ExternalIdType> externalIds) {
        this.externalIds = externalIds;
    }

    /** @return the value of the {@code id} property */
    public String getId() {
        return this.id;
    }

    public void setId(final String id) {
        this.id = id;
    }

    /** @return the value of the {@code labels} property */
    public List<Label> getLabels() {
        return this.labels;
    }

    public void setLabels(final List<Label> labels) {
        this.labels = labels;
    }

    /** @return the value of the {@code status} property */
    public String getStatus() {
        return this.status;
    }

    public void setStatus(final String status) {
        this.status = status;
    }

}

View File

@ -65,6 +65,7 @@
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg> <arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--fileURL</arg><arg>${topicFileURL}</arg> <arg>--fileURL</arg><arg>${topicFileURL}</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/topic</arg> <arg>--hdfsPath</arg><arg>${workingDir}/topic</arg>
<arg>--sheetName</arg><arg>${sheetName}</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic</arg> <arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic</arg>
</java> </java>
<ok to="read_projects"/> <ok to="read_projects"/>

View File

@ -23,6 +23,11 @@
"paramLongName" : "classForName", "paramLongName" : "classForName",
"paramDescription" : "the name of the class to deserialize the csv to", "paramDescription" : "the name of the class to deserialize the csv to",
"paramRequired" : true "paramRequired" : true
}, {
"paramName": "sn",
"paramLongName" : "sheetName",
"paramDescription" : "the name of the sheet in case the file is excel",
"paramRequired" : false
} }

View File

@ -0,0 +1,14 @@
[
{
"paramName": "i",
"paramLongName": "inputPath",
"paramDescription": "the path of the input json",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

View File

@ -0,0 +1,58 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>sparkExecutorNumber</name>
<value>4</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>sparkDriverMemory</name>
<value>15G</value>
</property>
<property>
<name>sparkExecutorMemory</name>
<value>6G</value>
</property>
<property>
<name>sparkExecutorCores</name>
<value>1</value>
</property>
</configuration>

View File

@ -0,0 +1,55 @@
<workflow-app name="Update_ROR_action_set" xmlns="uri:oozie:workflow:0.5">
<!-- Two-step workflow: clean the output/working paths, then run the Spark job that builds the ROR action set. -->
<parameters>
<property>
<name>rorJsonInputPath</name>
<description>the path of the json</description>
</property>
<property>
<name>rorActionSetPath</name>
<description>path where to store the action set</description>
</property>
</parameters>
<start to="deleteoutputpath"/>
<!-- Common failure target: both actions route their errors here. -->
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<!-- Deletes and recreates both the action set path and the working directory. -->
<!-- NOTE(review): workingDir is not declared in the parameters section above; presumably it is supplied by the job properties - confirm. -->
<action name="deleteoutputpath">
<fs>
<delete path="${rorActionSetPath}"/>
<mkdir path="${rorActionSetPath}"/>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="processRorFile"/>
<error to="Kill"/>
</action>
<!-- Runs GenerateRorActionSetJob on YARN in cluster mode, passing the two workflow parameters as inputPath and outputPath. -->
<action name="processRorFile">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>ProcessRorFile</name>
<class>eu.dnetlib.dhp.actionmanager.ror.GenerateRorActionSetJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputPath</arg><arg>${rorJsonInputPath}</arg>
<arg>--outputPath</arg><arg>${rorActionSetPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -35,8 +35,9 @@ public class EXCELParserTest {
EXCELParser excelParser = new EXCELParser(); EXCELParser excelParser = new EXCELParser();
List<Object> pl = excelParser final String classForName = "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic";
.parse(httpConnector.getInputSourceAsStream(URL), "eu.dnetlib.dhp.actionmanager.project.utils.ExcelTopic"); final String sheetName = "Topics";
List<Object> pl = excelParser.parse(httpConnector.getInputSourceAsStream(URL), classForName, sheetName);
Assertions.assertEquals(3837, pl.size()); Assertions.assertEquals(3837, pl.size());

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.actionmanager.ror;
import java.io.FileInputStream;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.schema.oaf.Organization;
/**
 * Manual checks for {@link GenerateRorActionSetJob#convertRorOrg(RorOrganization)}.
 *
 * <p>
 * NOTE(review): the class is disabled, presumably because {@code testConvertAllRorOrg} reads a file
 * from a developer-specific location ({@code local_file_path}) - confirm before enabling.
 */
@Disabled
class GenerateRorActionSetJobTest {

    private static final ObjectMapper mapper = new ObjectMapper();

    // Full ROR dump on a local machine: not available in the repository.
    private static final String local_file_path = "/Users/michele/Downloads/ror-data-2021-04-06.json";

    @BeforeEach
    void setUp() throws Exception {
    }

    /** Converts the single sample organization stored in {@code ror_org.json} and prints the result. */
    @Test
    void testConvertRorOrg() throws Exception {
        // The stream is handed to Jackson directly (no platform-charset decoding) and always closed.
        try (final java.io.InputStream is = getClass().getResourceAsStream("ror_org.json")) {
            final RorOrganization r = mapper.readValue(is, RorOrganization.class);
            final Organization org = GenerateRorActionSetJob.convertRorOrg(r);

            System.out.println(mapper.writeValueAsString(org));
        }
    }

    /** Converts every organization of the local dump; passes when no conversion throws. */
    @Test
    void testConvertAllRorOrg() throws Exception {
        try (final FileInputStream is = new FileInputStream(local_file_path)) {
            final RorOrganization[] arr = mapper.readValue(is, RorOrganization[].class);

            for (final RorOrganization r : arr) {
                GenerateRorActionSetJob.convertRorOrg(r);
            }
        }
    }

}

View File

@ -0,0 +1,123 @@
{
"ip_addresses": [],
"aliases": [],
"acronyms": [
"ANU"
],
"links": [
"http://www.anu.edu.au/"
],
"country": {
"country_code": "AU",
"country_name": "Australia"
},
"name": "Australian National University",
"wikipedia_url": "http://en.wikipedia.org/wiki/Australian_National_University",
"addresses": [
{
"lat": -35.2778,
"state_code": "AU-ACT",
"country_geonames_id": 2077456,
"lng": 149.1205,
"state": "Australian Capital Territory",
"city": "Canberra",
"geonames_city": {
"nuts_level2": {
"name": null,
"code": null
},
"geonames_admin2": {
"ascii_name": null,
"id": null,
"name": null,
"code": null
},
"geonames_admin1": {
"ascii_name": "ACT",
"id": 2177478,
"name": "ACT",
"code": "AU.01"
},
"city": "Canberra",
"id": 2172517,
"nuts_level1": {
"name": null,
"code": null
},
"nuts_level3": {
"name": null,
"code": null
},
"license": {
"attribution": "Data from geonames.org under a CC-BY 3.0 license",
"license": "http://creativecommons.org/licenses/by/3.0/"
}
},
"postcode": null,
"primary": false,
"line": null
}
],
"types": [
"Education"
],
"established": 1946,
"relationships": [
{
"type": "Related",
"id": "https://ror.org/041c7s516",
"label": "Calvary Hospital"
},
{
"type": "Related",
"id": "https://ror.org/04h7nbn38",
"label": "Canberra Hospital"
},
{
"type": "Related",
"id": "https://ror.org/030jpqj15",
"label": "Goulburn Base Hospital"
},
{
"type": "Child",
"id": "https://ror.org/006a4jj40",
"label": "Mount Stromlo Observatory"
}
],
"email_address": null,
"external_ids": {
"Wikidata": {
"all": [
"Q127990"
],
"preferred": null
},
"OrgRef": {
"all": [
"285106"
],
"preferred": null
},
"ISNI": {
"all": [
"0000 0001 2180 7477"
],
"preferred": null
},
"FundRef": {
"all": [
"501100000995",
"501100001151",
"100009020"
],
"preferred": "501100000995"
},
"GRID": {
"all": "grid.1001.0",
"preferred": "grid.1001.0"
}
},
"id": "https://ror.org/019wvm592",
"labels": [],
"status": "active"
}

View File

@ -4,6 +4,11 @@ package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -51,6 +56,11 @@ public class PrepareResultInstRepoAssociation {
final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
List<String> blacklist = Optional
.ofNullable(parser.get("blacklist"))
.map(v -> Arrays.asList(v.split(";")))
.orElse(new ArrayList<>());
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
@ -61,7 +71,7 @@ public class PrepareResultInstRepoAssociation {
readNeededResources(spark, inputPath); readNeededResources(spark, inputPath);
removeOutputDir(spark, datasourceOrganizationPath); removeOutputDir(spark, datasourceOrganizationPath);
prepareDatasourceOrganization(spark, datasourceOrganizationPath); prepareDatasourceOrganization(spark, datasourceOrganizationPath, blacklist);
removeOutputDir(spark, alreadyLinkedPath); removeOutputDir(spark, alreadyLinkedPath);
prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath); prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
@ -80,7 +90,14 @@ public class PrepareResultInstRepoAssociation {
} }
private static void prepareDatasourceOrganization( private static void prepareDatasourceOrganization(
SparkSession spark, String datasourceOrganizationPath) { SparkSession spark, String datasourceOrganizationPath, List<String> blacklist) {
String blacklisted = "";
if (blacklist.size() > 0) {
blacklisted = " AND d.id != '" + blacklist.get(0) + "'";
for (int i = 1; i < blacklist.size(); i++) {
blacklisted += " AND d.id != '" + blacklist.get(i) + "'";
}
}
String query = "SELECT source datasourceId, target organizationId " String query = "SELECT source datasourceId, target organizationId "
+ "FROM ( SELECT id " + "FROM ( SELECT id "
@ -88,7 +105,7 @@ public class PrepareResultInstRepoAssociation {
+ "WHERE datasourcetype.classid = '" + "WHERE datasourcetype.classid = '"
+ INSTITUTIONAL_REPO_TYPE + INSTITUTIONAL_REPO_TYPE
+ "' " + "' "
+ "AND datainfo.deletedbyinference = false ) d " + "AND datainfo.deletedbyinference = false " + blacklisted + " ) d "
+ "JOIN ( SELECT source, target " + "JOIN ( SELECT source, target "
+ "FROM relation " + "FROM relation "
+ "WHERE lower(relclass) = '" + "WHERE lower(relclass) = '"

View File

@ -28,5 +28,10 @@
"paramLongName": "isSparkSessionManaged", "paramLongName": "isSparkSessionManaged",
"paramDescription": "the path where prepared info have been stored", "paramDescription": "the path where prepared info have been stored",
"paramRequired": false "paramRequired": false
} },{
"paramName": "bl",
"paramLongName": "blacklist",
"paramDescription": "semicolon-separated list of the ids of the institutional repositories that should not be considered for the propagation",
"paramRequired": false
}
] ]

View File

@ -141,6 +141,7 @@
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg> <arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg> <arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<arg>--blacklist</arg><arg>${blacklist}</arg>
</spark> </spark>
<ok to="fork_join_apply_resulttoorganization_propagation"/> <ok to="fork_join_apply_resulttoorganization_propagation"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -41,7 +41,9 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
return prepareListStructProps( return prepareListStructProps(
doc, "//*[local-name()='titles']/*[local-name()='title']", MAIN_TITLE_QUALIFIER, info); doc,
"//*[local-name()='titles']/*[local-name()='title']|//*[local-name()='resource']/*[local-name()='title']",
MAIN_TITLE_QUALIFIER, info);
} }
@Override @Override

View File

@ -246,6 +246,51 @@ public class MappersTest {
assertEquals(r2.getValidationDate(), "2020-01-01"); assertEquals(r2.getValidationDate(), "2020-01-01");
} }
/**
 * Checks the mapping of {@code odf_bielefeld.xml}, a record whose {@code title} element is a direct
 * child of {@code resource} rather than being wrapped in a {@code titles} element. The record is
 * expected to yield exactly one {@link Publication} with a non-empty title, the expected first
 * author, and instances that are all flagged as OPEN.
 *
 * @throws IOException if the test resource cannot be read
 */
@Test
void testOdfBielefeld() throws IOException {
    final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_bielefeld.xml"));
    final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);

    assertEquals(1, list.size());
    assertTrue(list.get(0) instanceof Publication);

    final Publication p = (Publication) list.get(0);

    // identifiers and provenance
    assertValidId(p.getId());
    // assertEquals reports the actual size on failure, unlike assertTrue(size() == 1)
    assertEquals(1, p.getOriginalId().size());
    assertEquals("oai:pub.uni-bielefeld.de:2949739", p.getOriginalId().get(0));
    assertValidId(p.getCollectedfrom().get(0).getKey());

    // the first creator must be split into name and surname
    assertTrue(p.getAuthor().size() > 0);
    final Optional<Author> author = p.getAuthor().stream().findFirst();
    assertTrue(author.isPresent());
    assertEquals("Potwarka, Luke R.", author.get().getFullname());
    assertEquals("Potwarka", author.get().getSurname());
    assertEquals("Luke R.", author.get().getName());

    assertTrue(p.getSubject().size() > 0);

    // the title sits directly under the resource element in this record
    assertNotNull(p.getTitle());
    assertFalse(p.getTitle().isEmpty());

    // null check comes first so a missing instance list fails as an assertion, not as an NPE
    assertNotNull(p.getInstance());
    assertFalse(p.getInstance().isEmpty());
    p.getInstance().forEach(i -> {
        assertNotNull(i.getAccessright());
        assertEquals("OPEN", i.getAccessright().getClassid());
    });
    assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
}
@Test @Test
void testOpentrial() throws IOException { void testOpentrial() throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_opentrial.xml")); final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_opentrial.xml"));

View File

@ -0,0 +1,90 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<oai:header xmlns="http://namespace.openaire.eu/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<dri:objIdentifier>od______2294::3319684c321620512ddebd65f0e44dcf</dri:objIdentifier>
<dri:recordIdentifier>oai:pub.uni-bielefeld.de:2949739</dri:recordIdentifier>
<dri:dateOfCollection>2021-05-12T23:41:23.636Z</dri:dateOfCollection>
<oaf:datasourceprefix>od______2294</oaf:datasourceprefix>
<identifier xmlns="http://www.openarchives.org/OAI/2.0/">oai:pub.uni-bielefeld.de:2949739</identifier>
<datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2021-02-01T11:07:07Z</datestamp>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">journal_article</setSpec>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">doc-type:article</setSpec>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">ddc:796</setSpec>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">journal_articleFtxt</setSpec>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">open_access</setSpec>
<dr:dateOfTransformation>2021-05-12T23:51:25.559Z</dr:dateOfTransformation>
</oai:header>
<metadata>
<datacite:resource>
<datacite:identifier identifierType="DOI">10.3390/su13010069</datacite:identifier>
<datacite:alternateIdentifiers>
<datacite:alternateIdentifier alternateIdentifierType="URN">urn:nbn:de:0070-pub-29497390</datacite:alternateIdentifier>
<datacite:alternateIdentifier alternateIdentifierType="URL">https://pub.uni-bielefeld.de/record/2949739</datacite:alternateIdentifier>
</datacite:alternateIdentifiers>
<datacite:relatedIdentifiers>
<datacite:relatedIdentifier relatedIdentifierType="URL" relationType="HasMetadata">https://pub.uni-bielefeld.de/record/2949739.json</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="EISSN" relationType="isPartOf">2071-1050</datacite:relatedIdentifier>
</datacite:relatedIdentifiers>
<datacite:description descriptionType="Abstract">Policy makers often legitimize bids for major sport events and public funding of elite sports by trickle-down effects, suggesting that hosting events, sporting success, and athlete role models inspire the population to participate themselves in sport and physical activity. According to previous review articles, empirical evidence of trickle-down effects are mixed, with several studies citing marginal or no effect. The purpose of this study is to apply a realist synthesis approach to evaluate under which conditions trickle-down effects occur (i.e., what works for whom under which circumstances?). Using rapid evidence assessment methodology, 58 empirical articles were identified in the search process and critically analyzed through the lens of realist synthesis evaluation. The analysis identified six conditions under which trickle-down effects have occurred: Event leveraging initiatives, capacity of community sport to cater for new participants, live spectating experiences, consumption possibilities on television or other media, and communities housing event venues. The findings have implications for the sustainability of sport policy decisions and public finance, as the likelihood of trickle-down effects increases with integrated planning and sustainable spending related to the above six conditions.
</datacite:description>
<datacite:language>eng</datacite:language>
<datacite:publisher>MDPI </datacite:publisher>
<datacite:format>application/pdf</datacite:format>
<datacite:title xml:lang="eng">Conditions under Which Trickle-Down Effects Occur: A Realist Synthesis Approach</datacite:title>
<datacite:creators>
<datacite:creator>
<datacite:creatorName nameType="Personal">Potwarka, Luke R.</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName nameType="Personal">Wicker, Pamela</datacite:creatorName>
</datacite:creator>
</datacite:creators>
<datacite:date dateType="Issued">2021</datacite:date>
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
<datacite:subjects>
<datacite:subject schemeURI="http://dewey.info/" subjectScheme="dewey">796</datacite:subject>
<datacite:subject>demonstration effect</datacite:subject>
<datacite:subject>sport participation legacy</datacite:subject>
<datacite:subject>sport event</datacite:subject>
<datacite:subject>evaluation</datacite:subject>
</datacite:subjects>
<datacite:sizes>
<datacite:size>11 B</datacite:size>
</datacite:sizes>
</datacite:resource>
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
<oaf:dateAccepted>2021-01-01</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:license>https://creativecommons.org/licenses/by/4.0/</oaf:license>
<oaf:language>eng</oaf:language>
<oaf:hostedBy id="opendoar____::2294" name="Publications at Bielefeld University"/>
<oaf:collectedFrom id="opendoar____::2294" name="Publications at Bielefeld University"/>
<oaf:fulltext>https://pub.uni-bielefeld.de/download/2949739/2949794</oaf:fulltext>
</metadata>
<about xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2021-05-12T23:41:23.636Z">
<baseURL>http%3A%2F%2Fpub.uni-bielefeld.de%2Foai</baseURL>
<identifier>oai:pub.uni-bielefeld.de:2949739</identifier>
<datestamp>2021-02-01T11:07:07Z</datestamp>
<metadataNamespace/>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
classname="sysimport:crosswalk:datasetarchive"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>

View File

@ -6,6 +6,7 @@ import java.time.LocalDateTime
import java.time.format.DateTimeFormatter import java.time.format.DateTimeFormatter
import eu.dnetlib.dhp.common.PacePerson import eu.dnetlib.dhp.common.PacePerson
import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
@ -43,18 +44,18 @@ object DLIToOAF {
val relationTypeMapping: Map[String, (String, String)] = Map( val relationTypeMapping: Map[String, (String, String)] = Map(
"IsReferencedBy" -> ("isRelatedTo", "relationship"), "IsReferencedBy" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"References" -> ("isRelatedTo", "relationship"), "References" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"IsRelatedTo" -> ("isRelatedTo", "relationship"), "IsRelatedTo" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"IsSupplementedBy" -> ("isSupplementedBy", "supplement"), "IsSupplementedBy" -> (ModelConstants.IS_SUPPLEMENTED_BY, ModelConstants.SUPPLEMENT),
"Documents"-> ("isRelatedTo", "relationship"), "Documents"-> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"Cites" -> ("cites", "citation"), "Cites" -> (ModelConstants.CITES, ModelConstants.CITATION),
"Unknown" -> ("isRelatedTo", "relationship"), "Unknown" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"IsSourceOf" -> ("isRelatedTo", "relationship"), "IsSourceOf" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"IsCitedBy" -> ("IsCitedBy", "citation"), "IsCitedBy" -> (ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
"Reviews" -> ("reviews", "review"), "Reviews" -> (ModelConstants.REVIEWS, ModelConstants.REVIEW),
"Describes" -> ("isRelatedTo", "relationship"), "Describes" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP),
"HasAssociationWith" -> ("isRelatedTo", "relationship") "HasAssociationWith" -> (ModelConstants.IS_RELATED_TO, ModelConstants.RELATIONSHIP)
) )
val expectecdPidType = List("uniprot", "ena", "chembl", "ncbi-n", "ncbi-p", "genbank", "pdb", "url") val expectecdPidType = List("uniprot", "ena", "chembl", "ncbi-n", "ncbi-p", "genbank", "pdb", "url")
@ -83,11 +84,11 @@ object DLIToOAF {
val rel_inverse: Map[String, String] = Map( val rel_inverse: Map[String, String] = Map(
"isRelatedTo" -> "isRelatedTo", ModelConstants.IS_RELATED_TO -> ModelConstants.IS_RELATED_TO,
"isSupplementedBy" -> "isSupplementTo", ModelConstants.IS_SUPPLEMENTED_BY -> ModelConstants.IS_SUPPLEMENT_TO,
"cites" -> "IsCitedBy", ModelConstants.CITES -> ModelConstants.IS_CITED_BY,
"IsCitedBy" -> "cites", ModelConstants.IS_CITED_BY -> ModelConstants.CITES,
"reviews" -> "IsReviewedBy" ModelConstants.REVIEWS -> ModelConstants.IS_REVIEWED_BY
) )
@ -158,7 +159,7 @@ object DLIToOAF {
result.setUrl(e.url) result.setUrl(e.url)
result.setRefidentifier(e.pid) result.setRefidentifier(e.pid)
result.setDataInfo(generateDataInfo()) result.setDataInfo(generateDataInfo())
result.setQualifier(createQualifier(e.classId, "dnet:externalReference_typologies")) result.setQualifier(createQualifier(e.classId, ModelConstants.DNET_EXTERNAL_REFERENCE_TYPE))
result result
}) })
publication.setExternalReference(eRefs.asJava) publication.setExternalReference(eRefs.asJava)
@ -237,7 +238,7 @@ object DLIToOAF {
if (inputPublication.getAuthor == null || inputPublication.getAuthor.isEmpty) if (inputPublication.getAuthor == null || inputPublication.getAuthor.isEmpty)
return null return null
result.setAuthor(inputPublication.getAuthor.asScala.map(convertAuthor).asJava) result.setAuthor(inputPublication.getAuthor.asScala.map(convertAuthor).asJava)
result.setResulttype(createQualifier(inputPublication.getResulttype.getClassid, inputPublication.getResulttype.getClassname, "dnet:result_typologies", "dnet:result_typologies")) result.setResulttype(createQualifier(inputPublication.getResulttype.getClassid, inputPublication.getResulttype.getClassname, ModelConstants.DNET_RESULT_TYPOLOGIES, ModelConstants.DNET_RESULT_TYPOLOGIES))
if (inputPublication.getSubject != null) if (inputPublication.getSubject != null)
result.setSubject(inputPublication.getSubject.asScala.map(convertSubject).asJava) result.setSubject(inputPublication.getSubject.asScala.map(convertSubject).asJava)
@ -258,7 +259,7 @@ object DLIToOAF {
result.setDateofacceptance(asField(inputPublication.getRelevantdate.get(0).getValue)) result.setDateofacceptance(asField(inputPublication.getRelevantdate.get(0).getValue))
result.setPublisher(inputPublication.getPublisher) result.setPublisher(inputPublication.getPublisher)
result.setSource(inputPublication.getSource) result.setSource(inputPublication.getSource)
result.setBestaccessright(createQualifier("UNKNOWN", "not available", "dnet:access_modes", "dnet:access_modes")) result.setBestaccessright(createQualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))
val dois = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => p.getValue) val dois = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => p.getValue)
if (dois.isEmpty) if (dois.isEmpty)
@ -316,7 +317,7 @@ object DLIToOAF {
if (d.getAuthor == null || d.getAuthor.isEmpty) if (d.getAuthor == null || d.getAuthor.isEmpty)
return null return null
result.setAuthor(d.getAuthor.asScala.map(convertAuthor).asJava) result.setAuthor(d.getAuthor.asScala.map(convertAuthor).asJava)
result.setResulttype(createQualifier(d.getResulttype.getClassid, d.getResulttype.getClassname, "dnet:result_typologies", "dnet:result_typologies")) result.setResulttype(createQualifier(d.getResulttype.getClassid, d.getResulttype.getClassname, ModelConstants.DNET_RESULT_TYPOLOGIES, ModelConstants.DNET_RESULT_TYPOLOGIES))
if (d.getSubject != null) if (d.getSubject != null)
result.setSubject(d.getSubject.asScala.map(convertSubject).asJava) result.setSubject(d.getSubject.asScala.map(convertSubject).asJava)
@ -337,7 +338,7 @@ object DLIToOAF {
result.setDateofacceptance(asField(d.getRelevantdate.get(0).getValue)) result.setDateofacceptance(asField(d.getRelevantdate.get(0).getValue))
result.setPublisher(d.getPublisher) result.setPublisher(d.getPublisher)
result.setSource(d.getSource) result.setSource(d.getSource)
result.setBestaccessright(createQualifier("UNKNOWN", "not available", "dnet:access_modes", "dnet:access_modes")) result.setBestaccessright(createQualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))
val instance_urls = if (fpids.head.length < 5) s"https://www.rcsb.org/structure/${fpids.head}" else s"https://dx.doi.org/${fpids.head}" val instance_urls = if (fpids.head.length < 5) s"https://www.rcsb.org/structure/${fpids.head}" else s"https://dx.doi.org/${fpids.head}"
@ -364,13 +365,13 @@ object DLIToOAF {
val i = new Instance val i = new Instance
i.setUrl(List(url).asJava) i.setUrl(List(url).asJava)
if (dataset) if (dataset)
i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource")) i.setInstancetype(createQualifier("0021", "Dataset", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
else else
i.setInstancetype(createQualifier("0000", "Unknown", "dnet:publication_resource", "dnet:publication_resource")) i.setInstancetype(createQualifier("0000", "Unknown", ModelConstants.DNET_PUBLICATION_RESOURCE, ModelConstants.DNET_PUBLICATION_RESOURCE))
if (originalInstance != null && originalInstance.getHostedby != null) if (originalInstance != null && originalInstance.getHostedby != null)
i.setHostedby(originalInstance.getHostedby) i.setHostedby(originalInstance.getHostedby)
i.setAccessright(createQualifier("UNKNOWN", "not available", "dnet:access_modes", "dnet:access_modes")) i.setAccessright(createQualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES))
i.setDateofacceptance(doa) i.setDateofacceptance(doa)
i i
@ -380,19 +381,19 @@ object DLIToOAF {
def patchRelevantDate(d: StructuredProperty): StructuredProperty = { def patchRelevantDate(d: StructuredProperty): StructuredProperty = {
d.setQualifier(createQualifier("UNKNOWN", "dnet:dataCite_date")) d.setQualifier(createQualifier("UNKNOWN", ModelConstants.DNET_DATA_CITE_DATE))
d d
} }
def patchTitle(t: StructuredProperty): StructuredProperty = { def patchTitle(t: StructuredProperty): StructuredProperty = {
t.setQualifier(createQualifier("main title", "dnet:dataCite_title")) t.setQualifier(createQualifier("main title","dnet:dataCite_title"))
t t
} }
def convertSubject(s: StructuredProperty): StructuredProperty = { def convertSubject(s: StructuredProperty): StructuredProperty = {
s.setQualifier(createQualifier("keyword", "dnet:subject_classification_typologies")) s.setQualifier(createQualifier("keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES))
s s