forked from D-Net/dnet-hadoop
code formatting
This commit is contained in:
parent
1abcabb6e6
commit
8958f20813
|
@ -1,13 +1,14 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import org.apache.commons.lang.StringUtils;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Objects;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory class for OpenAIRE identifiers in the Graph
|
* Factory class for OpenAIRE identifiers in the Graph
|
||||||
|
@ -16,7 +17,8 @@ public class IdentifierFactory implements Serializable {
|
||||||
|
|
||||||
public static final String ID_SEPARATOR = "::";
|
public static final String ID_SEPARATOR = "::";
|
||||||
public static final String ID_PREFIX_SEPARATOR = "|";
|
public static final String ID_PREFIX_SEPARATOR = "|";
|
||||||
public final static String ID_REGEX = "^[0-9][0-9]\\"+ID_PREFIX_SEPARATOR+".{12}"+ID_SEPARATOR+"[a-zA-Z0-9]{32}$";
|
public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR
|
||||||
|
+ "[a-zA-Z0-9]{32}$";
|
||||||
public static final int ID_PREFIX_LEN = 12;
|
public static final int ID_PREFIX_LEN = 12;
|
||||||
|
|
||||||
public static <T extends OafEntity> String createIdentifier(T entity) {
|
public static <T extends OafEntity> String createIdentifier(T entity) {
|
||||||
|
@ -26,18 +28,18 @@ public class IdentifierFactory implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
return entity
|
return entity
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(s -> Objects.nonNull(s.getQualifier()))
|
.filter(s -> Objects.nonNull(s.getQualifier()))
|
||||||
.filter(s -> PidType.isValid(s.getQualifier().getClassid()))
|
.filter(s -> PidType.isValid(s.getQualifier().getClassid()))
|
||||||
.min(new PidComparator<>(entity))
|
.min(new PidComparator<>(entity))
|
||||||
.map(s -> idFromPid(entity, s))
|
.map(s -> idFromPid(entity, s))
|
||||||
.map(IdentifierFactory::verifyIdSyntax)
|
.map(IdentifierFactory::verifyIdSyntax)
|
||||||
.orElseGet(entity::getId);
|
.orElseGet(entity::getId);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String verifyIdSyntax(String s) {
|
private static String verifyIdSyntax(String s) {
|
||||||
if(StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
|
if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
|
||||||
throw new RuntimeException(String.format("malformed id: '%s'", s));
|
throw new RuntimeException(String.format("malformed id: '%s'", s));
|
||||||
} else {
|
} else {
|
||||||
return s;
|
return s;
|
||||||
|
@ -46,16 +48,16 @@ public class IdentifierFactory implements Serializable {
|
||||||
|
|
||||||
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s) {
|
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s) {
|
||||||
return new StringBuilder()
|
return new StringBuilder()
|
||||||
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
|
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
|
||||||
.append(ID_PREFIX_SEPARATOR)
|
.append(ID_PREFIX_SEPARATOR)
|
||||||
.append(createPrefix(s.getQualifier().getClassid()))
|
.append(createPrefix(s.getQualifier().getClassid()))
|
||||||
.append(ID_SEPARATOR)
|
.append(ID_SEPARATOR)
|
||||||
.append(DHPUtils.md5(normalizePidValue(s.getValue())))
|
.append(DHPUtils.md5(normalizePidValue(s.getValue())))
|
||||||
.toString();
|
.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String normalizePidValue(String value) {
|
private static String normalizePidValue(String value) {
|
||||||
//TODO more aggressive cleaning? keep only alphanum and punctuation?
|
// TODO more aggressive cleaning? keep only alphanum and punctuation?
|
||||||
return value.toLowerCase().replaceAll(" ", "");
|
return value.toLowerCase().replaceAll(" ", "");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,17 +1,18 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
import org.apache.commons.lang3.EnumUtils;
|
import org.apache.commons.lang3.EnumUtils;
|
||||||
|
|
||||||
public enum PidType {
|
public enum PidType {
|
||||||
|
|
||||||
// Result
|
// Result
|
||||||
doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,
|
doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,
|
||||||
|
|
||||||
// Organization
|
// Organization
|
||||||
GRID, mag_id, urn;
|
GRID, mag_id, urn;
|
||||||
|
|
||||||
public static boolean isValid(String type) {
|
public static boolean isValid(String type) {
|
||||||
return EnumUtils.isValidEnum(PidType.class, type);
|
return EnumUtils.isValidEnum(PidType.class, type);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,43 +1,47 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
public class IdentifierFactoryTest {
|
public class IdentifierFactoryTest {
|
||||||
|
|
||||||
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||||
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCreateIdentifierForPublication() throws IOException {
|
public void testCreateIdentifierForPublication() throws IOException {
|
||||||
|
|
||||||
verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
|
verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
|
||||||
verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
|
verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
|
||||||
verifyIdentifier("publication_urn.json", "50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
|
verifyIdentifier(
|
||||||
|
"publication_urn.json",
|
||||||
|
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
|
||||||
|
|
||||||
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
||||||
verifyIdentifier("publication_3.json", defaultID);
|
verifyIdentifier("publication_3.json", defaultID);
|
||||||
verifyIdentifier("publication_4.json", defaultID);
|
verifyIdentifier("publication_4.json", defaultID);
|
||||||
verifyIdentifier("publication_5.json", defaultID);
|
verifyIdentifier("publication_5.json", defaultID);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void verifyIdentifier(String filename, String expectedID) throws IOException {
|
protected void verifyIdentifier(String filename, String expectedID) throws IOException {
|
||||||
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
|
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
|
||||||
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
|
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
|
||||||
|
|
||||||
String id = IdentifierFactory.createIdentifier(pub);
|
String id = IdentifierFactory.createIdentifier(pub);
|
||||||
|
|
||||||
assertNotNull(id);
|
|
||||||
assertEquals(expectedID, id);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
assertNotNull(id);
|
||||||
|
assertEquals(expectedID, id);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,7 +10,6 @@ import java.util.Objects;
|
||||||
* - private String description to store the description of the programme
|
* - private String description to store the description of the programme
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
public class H2020Programme implements Serializable {
|
public class H2020Programme implements Serializable {
|
||||||
private String code;
|
private String code;
|
||||||
private String description;
|
private String description;
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.LicenseComparator;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.LicenseComparator;
|
||||||
|
|
||||||
public class Result extends OafEntity implements Serializable {
|
public class Result extends OafEntity implements Serializable {
|
||||||
|
|
||||||
private List<Measure> measures;
|
private List<Measure> measures;
|
||||||
|
@ -247,7 +247,8 @@ public class Result extends OafEntity implements Serializable {
|
||||||
|
|
||||||
instance = mergeLists(instance, r.getInstance());
|
instance = mergeLists(instance, r.getInstance());
|
||||||
|
|
||||||
if (r.getBestaccessright() != null && new LicenseComparator().compare(r.getBestaccessright(), bestaccessright) < 0)
|
if (r.getBestaccessright() != null
|
||||||
|
&& new LicenseComparator().compare(r.getBestaccessright(), bestaccessright) < 0)
|
||||||
bestaccessright = r.getBestaccessright();
|
bestaccessright = r.getBestaccessright();
|
||||||
|
|
||||||
if (r.getResulttype() != null && compareTrust(this, r) < 0)
|
if (r.getResulttype() != null && compareTrust(this, r) < 0)
|
||||||
|
|
|
@ -177,14 +177,12 @@ public class PrepareProgramme {
|
||||||
|
|
||||||
prepareClassification(h2020Programmes);
|
prepareClassification(h2020Programmes);
|
||||||
|
|
||||||
h2020Programmes.map(csvProgramme -> OBJECT_MAPPER.writeValueAsString(csvProgramme))
|
h2020Programmes
|
||||||
.saveAsTextFile(outputPath);
|
.map(csvProgramme -> OBJECT_MAPPER.writeValueAsString(csvProgramme))
|
||||||
|
.saveAsTextFile(outputPath);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
|
private static void prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
|
||||||
Object[] codedescription = h2020Programmes
|
Object[] codedescription = h2020Programmes
|
||||||
.map(value -> new Tuple2<>(value.getCode(), value.getTitle()))
|
.map(value -> new Tuple2<>(value.getCode(), value.getTitle()))
|
||||||
|
@ -255,7 +253,7 @@ public class PrepareProgramme {
|
||||||
}
|
}
|
||||||
h2020Programmes.foreach(csvProgramme -> {
|
h2020Programmes.foreach(csvProgramme -> {
|
||||||
if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom")
|
if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom")
|
||||||
&& !csvProgramme.getCode().equals("H2020-EC"))
|
&& !csvProgramme.getCode().equals("H2020-EC"))
|
||||||
csvProgramme.setClassification(map.get(csvProgramme.getCode() + "."));
|
csvProgramme.setClassification(map.get(csvProgramme.getCode() + "."));
|
||||||
else
|
else
|
||||||
csvProgramme.setClassification(map.get(csvProgramme.getCode()));
|
csvProgramme.setClassification(map.get(csvProgramme.getCode()));
|
||||||
|
|
|
@ -10,7 +10,6 @@ public class ProjectSubset implements Serializable {
|
||||||
|
|
||||||
private String code;
|
private String code;
|
||||||
|
|
||||||
|
|
||||||
public String getCode() {
|
public String getCode() {
|
||||||
return code;
|
return code;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue