forked from antonis.lempesis/dnet-hadoop
introduced PidBlacklist
This commit is contained in:
parent
893ac4a77b
commit
943b961cf6
|
@ -1,13 +1,12 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.List;
|
import java.util.*;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
||||||
|
@ -48,7 +47,8 @@ public class IdentifierFactory implements Serializable {
|
||||||
Map<String, List<StructuredProperty>> pids = entity
|
Map<String, List<StructuredProperty>> pids = entity
|
||||||
.getPid()
|
.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(s -> pidFilter(s))
|
.map(CleaningFunctions::normalizePidValue)
|
||||||
|
.filter(IdentifierFactory::pidFilter)
|
||||||
.collect(
|
.collect(
|
||||||
Collectors
|
Collectors
|
||||||
.groupingBy(
|
.groupingBy(
|
||||||
|
@ -83,17 +83,21 @@ public class IdentifierFactory implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static boolean pidFilter(StructuredProperty s) {
|
protected static boolean pidFilter(StructuredProperty s) {
|
||||||
|
final String pidValue = s.getValue();
|
||||||
if (Objects.isNull(s.getQualifier()) ||
|
if (Objects.isNull(s.getQualifier()) ||
|
||||||
StringUtils.isBlank(s.getValue()) ||
|
StringUtils.isBlank(pidValue) ||
|
||||||
StringUtils.isBlank(s.getValue().replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
|
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (CleaningFunctions.PID_BLACKLIST.contains(StringUtils.trim(s.getValue().toLowerCase()))) {
|
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
switch (PidType.tryValueOf(s.getQualifier().getClassid())) {
|
switch (PidType.tryValueOf(s.getQualifier().getClassid())) {
|
||||||
case doi:
|
case doi:
|
||||||
final String doi = StringUtils.trim(StringUtils.lowerCase(s.getValue()));
|
final String doi = StringUtils.trim(StringUtils.lowerCase(pidValue));
|
||||||
return doi.matches(DOI_REGEX);
|
return doi.matches(DOI_REGEX);
|
||||||
case original:
|
case original:
|
||||||
return false;
|
return false;
|
||||||
|
@ -103,13 +107,12 @@ public class IdentifierFactory implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
|
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
|
||||||
final String value = CleaningFunctions.normalizePidValue(s).getValue();
|
|
||||||
return new StringBuilder()
|
return new StringBuilder()
|
||||||
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
|
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
|
||||||
.append(ID_PREFIX_SEPARATOR)
|
.append(ID_PREFIX_SEPARATOR)
|
||||||
.append(createPrefix(s.getQualifier().getClassid()))
|
.append(createPrefix(s.getQualifier().getClassid()))
|
||||||
.append(ID_SEPARATOR)
|
.append(ID_SEPARATOR)
|
||||||
.append(md5 ? DHPUtils.md5(value) : value)
|
.append(md5 ? DHPUtils.md5(s.getValue()) : s.getValue())
|
||||||
.toString();
|
.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,2 +1,8 @@
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;public class PidBlacklist {
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
public class PidBlacklist extends HashMap<String, HashSet<String>> {
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,2 +1,37 @@
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;public class PidBlacklistProvider {
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
public class PidBlacklistProvider {
|
||||||
|
|
||||||
|
private static final PidBlacklist blacklist;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
String json = IOUtils.toString(IdentifierFactory.class.getResourceAsStream("pid_blacklist.json"));
|
||||||
|
blacklist = new ObjectMapper().readValue(json, PidBlacklist.class);
|
||||||
|
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static PidBlacklist getBlacklist() {
|
||||||
|
return blacklist;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Set<String> getBlacklist(String pidType) {
|
||||||
|
return Optional
|
||||||
|
.ofNullable(getBlacklist().get(pidType))
|
||||||
|
.orElse(new HashSet<>());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,2 +1,17 @@
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;public class BlackListProviderTest {
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
public class BlackListProviderTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void blackListTest() {
|
||||||
|
|
||||||
|
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist());
|
||||||
|
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist().get("doi"));
|
||||||
|
Assertions.assertTrue(PidBlacklistProvider.getBlacklist().get("doi").size() > 0);
|
||||||
|
Assertions.assertNull(PidBlacklistProvider.getBlacklist("xxx"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[ {"qualifier":{"classid":"doi"},"value":"10.12739/10.12739"},{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
Loading…
Reference in New Issue