forked from antonis.lempesis/dnet-hadoop
Merge pull request 'prepare_ror_actionset' (#106) from prepare_ror_actionset into stable_ids
Reviewed-on: D-Net/dnet-hadoop#106 Thanks Michele, looks good to me.
This commit is contained in:
commit
c00be646f3
|
@ -0,0 +1,226 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Date;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.project.SparkAtomicActionJob;
|
||||
import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType;
|
||||
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class GenerateRorActionSetJob {
|
||||
|
||||
private static final String COUNTRIES_VOC = "dnet:countries";
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateRorActionSetJob.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final List<KeyValue> ROR_COLLECTED_FROM = listKeyValues(
|
||||
"10|openaire____::993a7ae7a863813cf95028b50708e222", "ROR");
|
||||
|
||||
private static final DataInfo ROR_DATA_INFO = dataInfo(
|
||||
false, "", false, false, ENTITYREGISTRY_PROVENANCE_ACTION, "0.92");
|
||||
|
||||
private static final Qualifier ROR_PID_TYPE = qualifier("ROR", "ROR", "dnet:pid_types", "dnet:pid_types");
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
final String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkAtomicActionJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
processRorOrganizations(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void removeOutputDir(final SparkSession spark, final String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
private static void processRorOrganizations(final SparkSession spark,
|
||||
final String inputPath,
|
||||
final String outputPath) throws Exception {
|
||||
|
||||
readInputPath(spark, inputPath)
|
||||
.map(GenerateRorActionSetJob::convertRorOrg, Encoders.bean(Organization.class))
|
||||
.toJavaRDD()
|
||||
.map(o -> new AtomicAction<>(Organization.class, o))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||
}
|
||||
|
||||
protected static Organization convertRorOrg(final RorOrganization r) {
|
||||
|
||||
final Date now = new Date();
|
||||
|
||||
final Organization o = new Organization();
|
||||
|
||||
o.setId(String.format("20|ror_________::%s", DHPUtils.md5(r.getId())));
|
||||
o.setOriginalId(Arrays.asList(r.getId()));
|
||||
o.setCollectedfrom(ROR_COLLECTED_FROM);
|
||||
o.setPid(pids(r));
|
||||
o.setDateofcollection(now.toString());
|
||||
o.setDateoftransformation(now.toString());
|
||||
o.setExtraInfo(new ArrayList<>()); // Values not present in the file
|
||||
o.setOaiprovenance(null); // Values not present in the file
|
||||
o.setLegalshortname(field(r.getAcronyms().stream().findFirst().orElse(r.getName()), ROR_DATA_INFO));
|
||||
o.setLegalname(field(r.getName(), ROR_DATA_INFO));
|
||||
o.setAlternativeNames(alternativeNames(r));
|
||||
o.setWebsiteurl(field(r.getLinks().stream().findFirst().orElse(null), ROR_DATA_INFO));
|
||||
o.setLogourl(null);
|
||||
o.setEclegalbody(null);
|
||||
o.setEclegalperson(null);
|
||||
o.setEcnonprofit(null);
|
||||
o.setEcresearchorganization(null);
|
||||
o.setEchighereducation(null);
|
||||
o.setEcinternationalorganizationeurinterests(null);
|
||||
o.setEcinternationalorganization(null);
|
||||
o.setEcenterprise(null);
|
||||
o.setEcsmevalidated(null);
|
||||
o.setEcnutscode(null);
|
||||
if (r.getCountry() != null) {
|
||||
o
|
||||
.setCountry(
|
||||
qualifier(
|
||||
r.getCountry().getCountryCode(), r.getCountry().getCountryName(), COUNTRIES_VOC,
|
||||
COUNTRIES_VOC));
|
||||
} else {
|
||||
o.setCountry(null);
|
||||
}
|
||||
o.setDataInfo(ROR_DATA_INFO);
|
||||
o.setLastupdatetimestamp(now.getTime());
|
||||
|
||||
return o;
|
||||
}
|
||||
|
||||
private static List<StructuredProperty> pids(final RorOrganization r) {
|
||||
final List<StructuredProperty> pids = new ArrayList<>();
|
||||
pids.add(structuredProperty(r.getId(), ROR_PID_TYPE, ROR_DATA_INFO));
|
||||
|
||||
for (final Map.Entry<String, ExternalIdType> e : r.getExternalIds().entrySet()) {
|
||||
final String type = e.getKey();
|
||||
final Object all = e.getValue().getAll();
|
||||
if (all == null) {
|
||||
// skip
|
||||
} else if (all instanceof String) {
|
||||
pids
|
||||
.add(
|
||||
structuredProperty(
|
||||
all.toString(), qualifier(type, type, "dnet:pid_types", "dnet:pid_types"), ROR_DATA_INFO));
|
||||
} else if (all instanceof Collection) {
|
||||
for (final Object pid : (Collection<?>) all) {
|
||||
pids
|
||||
.add(
|
||||
structuredProperty(
|
||||
pid.toString(), qualifier(type, type, "dnet:pid_types", "dnet:pid_types"),
|
||||
ROR_DATA_INFO));
|
||||
}
|
||||
} else if (all instanceof String[]) {
|
||||
for (final String pid : (String[]) all) {
|
||||
pids
|
||||
.add(
|
||||
structuredProperty(
|
||||
pid, qualifier(type, type, "dnet:pid_types", "dnet:pid_types"), ROR_DATA_INFO));
|
||||
}
|
||||
} else {
|
||||
log.warn("Invalid type for pid list: " + all.getClass());
|
||||
}
|
||||
}
|
||||
|
||||
return pids;
|
||||
}
|
||||
|
||||
private static List<Field<String>> alternativeNames(final RorOrganization r) {
|
||||
final Set<String> names = new LinkedHashSet<>();
|
||||
names.addAll(r.getAliases());
|
||||
names.addAll(r.getAcronyms());
|
||||
r.getLabels().forEach(l -> names.add(l.getLabel()));
|
||||
|
||||
return names
|
||||
.stream()
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(s -> field(s, ROR_DATA_INFO))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static Dataset<RorOrganization> readInputPath(
|
||||
final SparkSession spark,
|
||||
final String path) throws Exception {
|
||||
|
||||
try (final FileSystem fileSystem = FileSystem.get(new Configuration());
|
||||
final InputStream is = fileSystem.open(new Path(path))) {
|
||||
final RorOrganization[] arr = OBJECT_MAPPER.readValue(is, RorOrganization[].class);
|
||||
return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,122 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class Address implements Serializable {
|
||||
|
||||
@JsonProperty("lat")
|
||||
private Float lat;
|
||||
|
||||
@JsonProperty("state_code")
|
||||
private String stateCode;
|
||||
|
||||
@JsonProperty("country_geonames_id")
|
||||
private Integer countryGeonamesId;
|
||||
|
||||
@JsonProperty("lng")
|
||||
private Float lng;
|
||||
|
||||
@JsonProperty("state")
|
||||
private String state;
|
||||
|
||||
@JsonProperty("city")
|
||||
private String city;
|
||||
|
||||
@JsonProperty("geonames_city")
|
||||
private GeonamesCity geonamesCity;
|
||||
|
||||
@JsonProperty("postcode")
|
||||
private String postcode;
|
||||
|
||||
@JsonProperty("primary")
|
||||
private Boolean primary;
|
||||
|
||||
@JsonProperty("line")
|
||||
private String line;
|
||||
|
||||
private final static long serialVersionUID = 2444635485253443195L;
|
||||
|
||||
public Float getLat() {
|
||||
return lat;
|
||||
}
|
||||
|
||||
public void setLat(final Float lat) {
|
||||
this.lat = lat;
|
||||
}
|
||||
|
||||
public String getStateCode() {
|
||||
return stateCode;
|
||||
}
|
||||
|
||||
public void setStateCode(final String stateCode) {
|
||||
this.stateCode = stateCode;
|
||||
}
|
||||
|
||||
public Integer getCountryGeonamesId() {
|
||||
return countryGeonamesId;
|
||||
}
|
||||
|
||||
public void setCountryGeonamesId(final Integer countryGeonamesId) {
|
||||
this.countryGeonamesId = countryGeonamesId;
|
||||
}
|
||||
|
||||
public Float getLng() {
|
||||
return lng;
|
||||
}
|
||||
|
||||
public void setLng(final Float lng) {
|
||||
this.lng = lng;
|
||||
}
|
||||
|
||||
public String getState() {
|
||||
return state;
|
||||
}
|
||||
|
||||
public void setState(final String state) {
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
public String getCity() {
|
||||
return city;
|
||||
}
|
||||
|
||||
public void setCity(final String city) {
|
||||
this.city = city;
|
||||
}
|
||||
|
||||
public GeonamesCity getGeonamesCity() {
|
||||
return geonamesCity;
|
||||
}
|
||||
|
||||
public void setGeonamesCity(final GeonamesCity geonamesCity) {
|
||||
this.geonamesCity = geonamesCity;
|
||||
}
|
||||
|
||||
public String getPostcode() {
|
||||
return postcode;
|
||||
}
|
||||
|
||||
public void setPostcode(final String postcode) {
|
||||
this.postcode = postcode;
|
||||
}
|
||||
|
||||
public Boolean getPrimary() {
|
||||
return primary;
|
||||
}
|
||||
|
||||
public void setPrimary(final Boolean primary) {
|
||||
this.primary = primary;
|
||||
}
|
||||
|
||||
public String getLine() {
|
||||
return line;
|
||||
}
|
||||
|
||||
public void setLine(final String line) {
|
||||
this.line = line;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class Country implements Serializable {
|
||||
|
||||
@JsonProperty("country_code")
|
||||
private String countryCode;
|
||||
|
||||
@JsonProperty("country_name")
|
||||
private String countryName;
|
||||
|
||||
private final static long serialVersionUID = 4357848706229493627L;
|
||||
|
||||
public String getCountryCode() {
|
||||
return countryCode;
|
||||
}
|
||||
|
||||
public void setCountryCode(final String countryCode) {
|
||||
this.countryCode = countryCode;
|
||||
}
|
||||
|
||||
public String getCountryName() {
|
||||
return countryName;
|
||||
}
|
||||
|
||||
public void setCountryName(final String countryName) {
|
||||
this.countryName = countryName;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class ExternalIdType implements Serializable {
|
||||
|
||||
@JsonProperty("all")
|
||||
private Object all;
|
||||
|
||||
@JsonProperty("preferred")
|
||||
private String preferred;
|
||||
|
||||
private final static long serialVersionUID = 2616688352998387611L;
|
||||
|
||||
public Object getAll() {
|
||||
return all;
|
||||
}
|
||||
|
||||
public void setAll(final Object all) {
|
||||
this.all = all;
|
||||
}
|
||||
|
||||
public String getPreferred() {
|
||||
return preferred;
|
||||
}
|
||||
|
||||
public void setPreferred(final String preferred) {
|
||||
this.preferred = preferred;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class GeonamesAdmin implements Serializable {
|
||||
|
||||
@JsonProperty("ascii_name")
|
||||
private String asciiName;
|
||||
|
||||
@JsonProperty("id")
|
||||
private Integer id;
|
||||
|
||||
@JsonProperty("name")
|
||||
private String name;
|
||||
|
||||
@JsonProperty("code")
|
||||
private String code;
|
||||
|
||||
private final static long serialVersionUID = 7294958526269195673L;
|
||||
|
||||
public String getAsciiName() {
|
||||
return asciiName;
|
||||
}
|
||||
|
||||
public void setAsciiName(final String asciiName) {
|
||||
this.asciiName = asciiName;
|
||||
}
|
||||
|
||||
public Integer getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final Integer id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
public void setCode(final String code) {
|
||||
this.code = code;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,100 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class GeonamesCity implements Serializable {
|
||||
|
||||
@JsonProperty("geonames_admin1")
|
||||
private GeonamesAdmin geonamesAdmin1;
|
||||
|
||||
@JsonProperty("geonames_admin2")
|
||||
private GeonamesAdmin geonamesAdmin2;
|
||||
|
||||
@JsonProperty("city")
|
||||
private String city;
|
||||
|
||||
@JsonProperty("id")
|
||||
private Integer id;
|
||||
|
||||
@JsonProperty("nuts_level1")
|
||||
private NameAndCode nutsLevel1;
|
||||
|
||||
@JsonProperty("nuts_level2")
|
||||
private NameAndCode nutsLevel2;
|
||||
|
||||
@JsonProperty("nuts_level3")
|
||||
private NameAndCode nutsLevel3;
|
||||
|
||||
@JsonProperty("")
|
||||
private License license;
|
||||
|
||||
private final static long serialVersionUID = -8389480201526252955L;
|
||||
|
||||
public NameAndCode getNutsLevel2() {
|
||||
return nutsLevel2;
|
||||
}
|
||||
|
||||
public void setNutsLevel2(final NameAndCode nutsLevel2) {
|
||||
this.nutsLevel2 = nutsLevel2;
|
||||
}
|
||||
|
||||
public GeonamesAdmin getGeonamesAdmin2() {
|
||||
return geonamesAdmin2;
|
||||
}
|
||||
|
||||
public void setGeonamesAdmin2(final GeonamesAdmin geonamesAdmin2) {
|
||||
this.geonamesAdmin2 = geonamesAdmin2;
|
||||
}
|
||||
|
||||
public GeonamesAdmin getGeonamesAdmin1() {
|
||||
return geonamesAdmin1;
|
||||
}
|
||||
|
||||
public void setGeonamesAdmin1(final GeonamesAdmin geonamesAdmin1) {
|
||||
this.geonamesAdmin1 = geonamesAdmin1;
|
||||
}
|
||||
|
||||
public String getCity() {
|
||||
return city;
|
||||
}
|
||||
|
||||
public void setCity(final String city) {
|
||||
this.city = city;
|
||||
}
|
||||
|
||||
public Integer getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final Integer id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public NameAndCode getNutsLevel1() {
|
||||
return nutsLevel1;
|
||||
}
|
||||
|
||||
public void setNutsLevel1(final NameAndCode nutsLevel1) {
|
||||
this.nutsLevel1 = nutsLevel1;
|
||||
}
|
||||
|
||||
public NameAndCode getNutsLevel3() {
|
||||
return nutsLevel3;
|
||||
}
|
||||
|
||||
public void setNutsLevel3(final NameAndCode nutsLevel3) {
|
||||
this.nutsLevel3 = nutsLevel3;
|
||||
}
|
||||
|
||||
public License getLicense() {
|
||||
return license;
|
||||
}
|
||||
|
||||
public void setLicense(final License license) {
|
||||
this.license = license;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class Label implements Serializable {
|
||||
|
||||
@JsonProperty("iso639")
|
||||
private String iso639;
|
||||
|
||||
@JsonProperty("label")
|
||||
private String label;
|
||||
|
||||
private final static long serialVersionUID = -6576156103297850809L;
|
||||
|
||||
public String getIso639() {
|
||||
return iso639;
|
||||
}
|
||||
|
||||
public void setIso639(final String iso639) {
|
||||
this.iso639 = iso639;
|
||||
}
|
||||
|
||||
public String getLabel() {
|
||||
return label;
|
||||
}
|
||||
|
||||
public void setLabel(final String label) {
|
||||
this.label = label;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class License implements Serializable {
|
||||
|
||||
@JsonProperty("attribution")
|
||||
private String attribution;
|
||||
|
||||
@JsonProperty("license")
|
||||
private String license;
|
||||
|
||||
private final static long serialVersionUID = -194308261058176439L;
|
||||
|
||||
public String getAttribution() {
|
||||
return attribution;
|
||||
}
|
||||
|
||||
public void setAttribution(final String attribution) {
|
||||
this.attribution = attribution;
|
||||
}
|
||||
|
||||
public String getLicense() {
|
||||
return license;
|
||||
}
|
||||
|
||||
public void setLicense(final String license) {
|
||||
this.license = license;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class NameAndCode implements Serializable {
|
||||
|
||||
@JsonProperty("name")
|
||||
private String name;
|
||||
|
||||
@JsonProperty("code")
|
||||
private String code;
|
||||
|
||||
private final static long serialVersionUID = 5459836979206140843L;
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
public void setCode(final String code) {
|
||||
this.code = code;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class Relationship implements Serializable {
|
||||
|
||||
@JsonProperty("type")
|
||||
private String type;
|
||||
|
||||
@JsonProperty("id")
|
||||
private String id;
|
||||
|
||||
@JsonProperty("label")
|
||||
private String label;
|
||||
|
||||
private final static long serialVersionUID = 7847399503395576960L;
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(final String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getLabel() {
|
||||
return label;
|
||||
}
|
||||
|
||||
public void setLabel(final String label) {
|
||||
this.label = label;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,192 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
public class RorOrganization implements Serializable {
|
||||
|
||||
@JsonProperty("ip_addresses")
|
||||
private List<String> ipAddresses = new ArrayList<>();
|
||||
|
||||
@JsonProperty("aliases")
|
||||
private List<String> aliases = new ArrayList<>();
|
||||
|
||||
@JsonProperty("acronyms")
|
||||
private List<String> acronyms = new ArrayList<>();
|
||||
|
||||
@JsonProperty("links")
|
||||
private List<String> links = new ArrayList<>();
|
||||
|
||||
@JsonProperty("country")
|
||||
private Country country;
|
||||
|
||||
@JsonProperty("name")
|
||||
private String name;
|
||||
|
||||
@JsonProperty("wikipedia_url")
|
||||
private String wikipediaUrl;
|
||||
|
||||
@JsonProperty("addresses")
|
||||
private List<Address> addresses = new ArrayList<>();
|
||||
|
||||
@JsonProperty("types")
|
||||
private List<String> types = new ArrayList<>();
|
||||
|
||||
@JsonProperty("established")
|
||||
private Integer established;
|
||||
|
||||
@JsonProperty("relationships")
|
||||
private List<Relationship> relationships = new ArrayList<>();
|
||||
|
||||
@JsonProperty("email_address")
|
||||
private String emailAddress;
|
||||
|
||||
@JsonProperty("external_ids")
|
||||
private Map<String, ExternalIdType> externalIds = new LinkedHashMap<>();
|
||||
|
||||
@JsonProperty("id")
|
||||
private String id;
|
||||
|
||||
@JsonProperty("labels")
|
||||
private List<Label> labels = new ArrayList<>();
|
||||
|
||||
@JsonProperty("status")
|
||||
private String status;
|
||||
|
||||
private final static long serialVersionUID = -2658312087616043225L;
|
||||
|
||||
public List<String> getIpAddresses() {
|
||||
return ipAddresses;
|
||||
}
|
||||
|
||||
public void setIpAddresses(final List<String> ipAddresses) {
|
||||
this.ipAddresses = ipAddresses;
|
||||
}
|
||||
|
||||
public List<String> getAliases() {
|
||||
return aliases;
|
||||
}
|
||||
|
||||
public void setAliases(final List<String> aliases) {
|
||||
this.aliases = aliases;
|
||||
}
|
||||
|
||||
public List<String> getAcronyms() {
|
||||
return acronyms;
|
||||
}
|
||||
|
||||
public void setAcronyms(final List<String> acronyms) {
|
||||
this.acronyms = acronyms;
|
||||
}
|
||||
|
||||
public List<String> getLinks() {
|
||||
return links;
|
||||
}
|
||||
|
||||
public void setLinks(final List<String> links) {
|
||||
this.links = links;
|
||||
}
|
||||
|
||||
public Country getCountry() {
|
||||
return country;
|
||||
}
|
||||
|
||||
public void setCountry(final Country country) {
|
||||
this.country = country;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getWikipediaUrl() {
|
||||
return wikipediaUrl;
|
||||
}
|
||||
|
||||
public void setWikipediaUrl(final String wikipediaUrl) {
|
||||
this.wikipediaUrl = wikipediaUrl;
|
||||
}
|
||||
|
||||
public List<Address> getAddresses() {
|
||||
return addresses;
|
||||
}
|
||||
|
||||
public void setAddresses(final List<Address> addresses) {
|
||||
this.addresses = addresses;
|
||||
}
|
||||
|
||||
public List<String> getTypes() {
|
||||
return types;
|
||||
}
|
||||
|
||||
public void setTypes(final List<String> types) {
|
||||
this.types = types;
|
||||
}
|
||||
|
||||
public Integer getEstablished() {
|
||||
return established;
|
||||
}
|
||||
|
||||
public void setEstablished(final Integer established) {
|
||||
this.established = established;
|
||||
}
|
||||
|
||||
public List<Relationship> getRelationships() {
|
||||
return relationships;
|
||||
}
|
||||
|
||||
public void setRelationships(final List<Relationship> relationships) {
|
||||
this.relationships = relationships;
|
||||
}
|
||||
|
||||
public String getEmailAddress() {
|
||||
return emailAddress;
|
||||
}
|
||||
|
||||
public void setEmailAddress(final String emailAddress) {
|
||||
this.emailAddress = emailAddress;
|
||||
}
|
||||
|
||||
public Map<String, ExternalIdType> getExternalIds() {
|
||||
return externalIds;
|
||||
}
|
||||
|
||||
public void setExternalIds(final Map<String, ExternalIdType> externalIds) {
|
||||
this.externalIds = externalIds;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public List<Label> getLabels() {
|
||||
return labels;
|
||||
}
|
||||
|
||||
public void setLabels(final List<Label> labels) {
|
||||
this.labels = labels;
|
||||
}
|
||||
|
||||
public String getStatus() {
|
||||
return status;
|
||||
}
|
||||
|
||||
public void setStatus(final String status) {
|
||||
this.status = status;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
[
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the path of the input json",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,58 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorNumber</name>
|
||||
<value>4</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<value>15G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<value>6G</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<value>1</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,55 @@
|
|||
<workflow-app name="Update_ROR_action_set" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>rorJsonInputPath</name>
|
||||
<description>the path of the json</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>rorActionSetPath</name>
|
||||
<description>path where to store the action set</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="deleteoutputpath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="deleteoutputpath">
|
||||
<fs>
|
||||
<delete path='${rorActionSetPath}'/>
|
||||
<mkdir path='${rorActionSetPath}'/>
|
||||
<delete path='${workingDir}'/>
|
||||
<mkdir path='${workingDir}'/>
|
||||
</fs>
|
||||
<ok to="processRorFile"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="processRorFile">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ProcessRorFile</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.ror.GenerateRorActionSetJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${rorJsonInputPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${rorActionSetPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,46 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.ror;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
|
||||
@Disabled
|
||||
class GenerateRorActionSetJobTest {
|
||||
|
||||
private static final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
private static final String local_file_path = "/Users/michele/Downloads/ror-data-2021-04-06.json";
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
}
|
||||
|
||||
@Test
|
||||
void testConvertRorOrg() throws Exception {
|
||||
final RorOrganization r = mapper
|
||||
.readValue(IOUtils.toString(getClass().getResourceAsStream("ror_org.json")), RorOrganization.class);
|
||||
final Organization org = GenerateRorActionSetJob.convertRorOrg(r);
|
||||
|
||||
System.out.println(mapper.writeValueAsString(org));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testConvertAllRorOrg() throws Exception {
|
||||
final RorOrganization[] arr = mapper
|
||||
.readValue(IOUtils.toString(new FileInputStream(local_file_path)), RorOrganization[].class);
|
||||
|
||||
for (final RorOrganization r : arr) {
|
||||
GenerateRorActionSetJob.convertRorOrg(r);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,123 @@
|
|||
{
|
||||
"ip_addresses": [],
|
||||
"aliases": [],
|
||||
"acronyms": [
|
||||
"ANU"
|
||||
],
|
||||
"links": [
|
||||
"http://www.anu.edu.au/"
|
||||
],
|
||||
"country": {
|
||||
"country_code": "AU",
|
||||
"country_name": "Australia"
|
||||
},
|
||||
"name": "Australian National University",
|
||||
"wikipedia_url": "http://en.wikipedia.org/wiki/Australian_National_University",
|
||||
"addresses": [
|
||||
{
|
||||
"lat": -35.2778,
|
||||
"state_code": "AU-ACT",
|
||||
"country_geonames_id": 2077456,
|
||||
"lng": 149.1205,
|
||||
"state": "Australian Capital Territory",
|
||||
"city": "Canberra",
|
||||
"geonames_city": {
|
||||
"nuts_level2": {
|
||||
"name": null,
|
||||
"code": null
|
||||
},
|
||||
"geonames_admin2": {
|
||||
"ascii_name": null,
|
||||
"id": null,
|
||||
"name": null,
|
||||
"code": null
|
||||
},
|
||||
"geonames_admin1": {
|
||||
"ascii_name": "ACT",
|
||||
"id": 2177478,
|
||||
"name": "ACT",
|
||||
"code": "AU.01"
|
||||
},
|
||||
"city": "Canberra",
|
||||
"id": 2172517,
|
||||
"nuts_level1": {
|
||||
"name": null,
|
||||
"code": null
|
||||
},
|
||||
"nuts_level3": {
|
||||
"name": null,
|
||||
"code": null
|
||||
},
|
||||
"license": {
|
||||
"attribution": "Data from geonames.org under a CC-BY 3.0 license",
|
||||
"license": "http://creativecommons.org/licenses/by/3.0/"
|
||||
}
|
||||
},
|
||||
"postcode": null,
|
||||
"primary": false,
|
||||
"line": null
|
||||
}
|
||||
],
|
||||
"types": [
|
||||
"Education"
|
||||
],
|
||||
"established": 1946,
|
||||
"relationships": [
|
||||
{
|
||||
"type": "Related",
|
||||
"id": "https://ror.org/041c7s516",
|
||||
"label": "Calvary Hospital"
|
||||
},
|
||||
{
|
||||
"type": "Related",
|
||||
"id": "https://ror.org/04h7nbn38",
|
||||
"label": "Canberra Hospital"
|
||||
},
|
||||
{
|
||||
"type": "Related",
|
||||
"id": "https://ror.org/030jpqj15",
|
||||
"label": "Goulburn Base Hospital"
|
||||
},
|
||||
{
|
||||
"type": "Child",
|
||||
"id": "https://ror.org/006a4jj40",
|
||||
"label": "Mount Stromlo Observatory"
|
||||
}
|
||||
],
|
||||
"email_address": null,
|
||||
"external_ids": {
|
||||
"Wikidata": {
|
||||
"all": [
|
||||
"Q127990"
|
||||
],
|
||||
"preferred": null
|
||||
},
|
||||
"OrgRef": {
|
||||
"all": [
|
||||
"285106"
|
||||
],
|
||||
"preferred": null
|
||||
},
|
||||
"ISNI": {
|
||||
"all": [
|
||||
"0000 0001 2180 7477"
|
||||
],
|
||||
"preferred": null
|
||||
},
|
||||
"FundRef": {
|
||||
"all": [
|
||||
"501100000995",
|
||||
"501100001151",
|
||||
"100009020"
|
||||
],
|
||||
"preferred": "501100000995"
|
||||
},
|
||||
"GRID": {
|
||||
"all": "grid.1001.0",
|
||||
"preferred": "grid.1001.0"
|
||||
}
|
||||
},
|
||||
"id": "https://ror.org/019wvm592",
|
||||
"labels": [],
|
||||
"status": "active"
|
||||
}
|
Loading…
Reference in New Issue