introduced PidBlacklist

This commit is contained in:
Claudio Atzori 2020-12-02 09:30:34 +01:00
parent 893ac4a77b
commit 943b961cf6
6 changed files with 79 additions and 15 deletions

View File

@ -1,13 +1,12 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.*;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
@ -48,7 +47,8 @@ public class IdentifierFactory implements Serializable {
Map<String, List<StructuredProperty>> pids = entity Map<String, List<StructuredProperty>> pids = entity
.getPid() .getPid()
.stream() .stream()
.filter(s -> pidFilter(s)) .map(CleaningFunctions::normalizePidValue)
.filter(IdentifierFactory::pidFilter)
.collect( .collect(
Collectors Collectors
.groupingBy( .groupingBy(
@ -83,17 +83,21 @@ public class IdentifierFactory implements Serializable {
} }
protected static boolean pidFilter(StructuredProperty s) { protected static boolean pidFilter(StructuredProperty s) {
final String pidValue = s.getValue();
if (Objects.isNull(s.getQualifier()) || if (Objects.isNull(s.getQualifier()) ||
StringUtils.isBlank(s.getValue()) || StringUtils.isBlank(pidValue) ||
StringUtils.isBlank(s.getValue().replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) { StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
return false; return false;
} }
if (CleaningFunctions.PID_BLACKLIST.contains(StringUtils.trim(s.getValue().toLowerCase()))) { if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
return false;
}
if (PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue)) {
return false; return false;
} }
switch (PidType.tryValueOf(s.getQualifier().getClassid())) { switch (PidType.tryValueOf(s.getQualifier().getClassid())) {
case doi: case doi:
final String doi = StringUtils.trim(StringUtils.lowerCase(s.getValue())); final String doi = StringUtils.trim(StringUtils.lowerCase(pidValue));
return doi.matches(DOI_REGEX); return doi.matches(DOI_REGEX);
case original: case original:
return false; return false;
@ -103,13 +107,12 @@ public class IdentifierFactory implements Serializable {
} }
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) { private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
final String value = CleaningFunctions.normalizePidValue(s).getValue();
return new StringBuilder() return new StringBuilder()
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR)) .append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
.append(ID_PREFIX_SEPARATOR) .append(ID_PREFIX_SEPARATOR)
.append(createPrefix(s.getQualifier().getClassid())) .append(createPrefix(s.getQualifier().getClassid()))
.append(ID_SEPARATOR) .append(ID_SEPARATOR)
.append(md5 ? DHPUtils.md5(value) : value) .append(md5 ? DHPUtils.md5(s.getValue()) : s.getValue())
.toString(); .toString();
} }

View File

@ -1,2 +1,8 @@
package eu.dnetlib.dhp.schema.oaf.utils;public class PidBlacklist {
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.HashMap;
import java.util.HashSet;
/**
 * Blacklisted PID values, grouped by key.
 *
 * <p>Each key maps to the set of PID values that must be rejected for that key. The class adds no
 * behaviour of its own to {@link HashMap}; it exists to give the
 * {@code HashMap<String, HashSet<String>>} shape a concrete name that can be passed as a target
 * class when deserialising.
 */
public class PidBlacklist extends HashMap<String, HashSet<String>> {
}

View File

@ -1,2 +1,37 @@
package eu.dnetlib.dhp.schema.oaf.utils;public class PidBlacklistProvider {
package eu.dnetlib.dhp.schema.oaf.utils;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
public class PidBlacklistProvider {
private static final PidBlacklist blacklist;
static {
try {
String json = IOUtils.toString(IdentifierFactory.class.getResourceAsStream("pid_blacklist.json"));
blacklist = new ObjectMapper().readValue(json, PidBlacklist.class);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public static PidBlacklist getBlacklist() {
return blacklist;
}
public static Set<String> getBlacklist(String pidType) {
return Optional
.ofNullable(getBlacklist().get(pidType))
.orElse(new HashSet<>());
}
} }

File diff suppressed because one or more lines are too long

View File

@ -1,2 +1,17 @@
package eu.dnetlib.dhp.schema.oaf.utils;public class BlackListProviderTest {
package eu.dnetlib.dhp.schema.oaf.utils;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
public class BlackListProviderTest {
@Test
public void blackListTest() {
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist());
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist().get("doi"));
Assertions.assertTrue(PidBlacklistProvider.getBlacklist().get("doi").size() > 0);
Assertions.assertNull(PidBlacklistProvider.getBlacklist("xxx"));
}
} }

View File

@ -1 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]} {"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[ {"qualifier":{"classid":"doi"},"value":"10.12739/10.12739"},{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}