forked from D-Net/dnet-hadoop
cleaning tab characters from text fields
parent 596a2a459d
commit 758d27745d
@@ -15,7 +15,7 @@ public class CleaningFunctions {
 
     public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
     public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
-    public static final String NEWLINES = "(?:\\n|\\r)";
+    public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
 
     public static final Set<String> PID_BLACKLIST = new HashSet<>();
 
@@ -109,7 +109,7 @@ public class CleaningFunctions {
                 .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
                 .filter(sp -> Objects.nonNull(sp.getQualifier()))
                 .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
-                .map(CleaningFunctions::removeNewLines)
+                .map(CleaningFunctions::cleanValue)
                 .collect(Collectors.toList()));
         }
         if (Objects.nonNull(r.getTitle())) {
@@ -120,7 +120,7 @@ public class CleaningFunctions {
                 .stream()
                 .filter(Objects::nonNull)
                 .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
-                .map(CleaningFunctions::removeNewLines)
+                .map(CleaningFunctions::cleanValue)
                 .collect(Collectors.toList()));
         }
         if (Objects.nonNull(r.getDescription())) {
@@ -131,7 +131,7 @@ public class CleaningFunctions {
                 .stream()
                 .filter(Objects::nonNull)
                 .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
-                .map(CleaningFunctions::removeNewLines)
+                .map(CleaningFunctions::cleanValue)
                 .collect(Collectors.toList()));
         }
         if (Objects.nonNull(r.getPid())) {
@@ -228,13 +228,13 @@ public class CleaningFunctions {
         return value;
     }
 
-    protected static StructuredProperty removeNewLines(StructuredProperty s) {
-        s.setValue(s.getValue().replaceAll(NEWLINES, " "));
+    protected static StructuredProperty cleanValue(StructuredProperty s) {
+        s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
         return s;
     }
 
-    protected static Field<String> removeNewLines(Field<String> s) {
-        s.setValue(s.getValue().replaceAll(NEWLINES, " "));
+    protected static Field<String> cleanValue(Field<String> s) {
+        s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
         return s;
     }
 
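The substance of this file's change is the regex constant: CLEANING_REGEX extends the old NEWLINES pattern with \t, so tab characters are normalised to spaces along with line breaks, and the helper is renamed from removeNewLines to cleanValue to match. The following standalone sketch (not part of the patch; the sample string is made up) shows the replacement that cleanValue applies to StructuredProperty and Field<String> values:

    public class CleaningRegexSketch {

        // Pattern introduced by this commit: line feeds, carriage returns and tabs.
        public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";

        public static void main(String[] args) {
            // Made-up dirty value standing in for a text field of a record.
            String dirty = "A title\twith a tab\nand a newline";
            // cleanValue(...) performs exactly this replaceAll on the wrapped value.
            String clean = dirty.replaceAll(CLEANING_REGEX, " ");
            System.out.println(clean); // prints: A title with a tab and a newline
        }
    }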
@@ -1,6 +1,8 @@
 
 package eu.dnetlib.dhp.schema.oaf;
 
+import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
+
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Optional;
@@ -8,8 +10,8 @@ import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
 import com.google.common.collect.Sets;
 
 import eu.dnetlib.dhp.schema.common.ModelConstants;
-import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
 
 public class ResultTypeComparator implements Comparator<Result> {
@@ -64,10 +66,13 @@ public class ResultTypeComparator implements Comparator<Result> {
     }
 
     protected HashSet<String> getCollectedFromIds(Result left) {
-        return Optional.ofNullable(left.getCollectedfrom())
-            .map(cf -> cf.stream()
-                .map(c -> c.getKey())
-                .collect(Collectors.toCollection(HashSet::new)))
-            .orElse(new HashSet<>());
+        return Optional
+            .ofNullable(left.getCollectedfrom())
+            .map(
+                cf -> cf
+                    .stream()
+                    .map(c -> c.getKey())
+                    .collect(Collectors.toCollection(HashSet::new)))
+            .orElse(new HashSet<>());
     }
 }
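Besides relocating the CROSSREF_ID static import, this file's hunks only reflow getCollectedFromIds; the behaviour is unchanged: a null collectedfrom list still yields an empty set. A minimal standalone sketch of that null-safe Optional chain, using a stripped-down stand-in for the KeyValue bean (the id string is made up):

    import java.util.HashSet;
    import java.util.List;
    import java.util.Optional;
    import java.util.stream.Collectors;

    public class CollectedFromIdsSketch {

        // Reduced stand-in for eu.dnetlib.dhp.schema.oaf.KeyValue, keeping only the key used here.
        static class KeyValue {
            private final String key;

            KeyValue(String key) {
                this.key = key;
            }

            String getKey() {
                return key;
            }
        }

        // Mirrors the reformatted ResultTypeComparator.getCollectedFromIds, taking the list directly.
        static HashSet<String> getCollectedFromIds(List<KeyValue> collectedfrom) {
            return Optional
                .ofNullable(collectedfrom)
                .map(
                    cf -> cf
                        .stream()
                        .map(c -> c.getKey())
                        .collect(Collectors.toCollection(HashSet::new)))
                .orElse(new HashSet<>());
        }

        public static void main(String[] args) {
            System.out.println(getCollectedFromIds(null)); // []
            System.out.println(getCollectedFromIds(List.of(new KeyValue("10|fake________::id")))); // [10|fake________::id]
        }
    }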
@@ -1,56 +1,69 @@
 
 package eu.dnetlib.dhp.schema.oaf;
 
-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.schema.common.ModelConstants;
-import it.unimi.dsi.fastutil.Hash;
-import org.apache.commons.io.IOUtils;
-import org.jetbrains.annotations.NotNull;
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.*;
 
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.List;
 import java.util.stream.Collectors;
 
-import static org.junit.jupiter.api.Assertions.*;
+import org.apache.commons.io.IOUtils;
+import org.jetbrains.annotations.NotNull;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+import it.unimi.dsi.fastutil.Hash;
 
 public class OafMapperUtilsTest {
 
     private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
         .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
 
     @Test
     public void testMergePubs() throws IOException {
         Publication p1 = read("publication_1.json", Publication.class);
         Publication p2 = read("publication_2.json", Publication.class);
         Dataset d1 = read("dataset_1.json", Dataset.class);
         Dataset d2 = read("dataset_2.json", Dataset.class);
 
         assertEquals(p1.getCollectedfrom().size(), 1);
         assertEquals(p1.getCollectedfrom().get(0).getKey(), ModelConstants.CROSSREF_ID);
         assertEquals(d2.getCollectedfrom().size(), 1);
         assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
 
-        assertTrue(OafMapperUtils.mergeResults(p1, d2).getResulttype().getClassid().equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID));
+        assertTrue(
+            OafMapperUtils
+                .mergeResults(p1, d2)
+                .getResulttype()
+                .getClassid()
+                .equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID));
 
         assertEquals(p2.getCollectedfrom().size(), 1);
         assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
         assertEquals(d1.getCollectedfrom().size(), 1);
         assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
 
-        assertTrue(OafMapperUtils.mergeResults(p2, d1).getResulttype().getClassid().equals(ModelConstants.DATASET_RESULTTYPE_CLASSID));
-    }
+        assertTrue(
+            OafMapperUtils
+                .mergeResults(p2, d1)
+                .getResulttype()
+                .getClassid()
+                .equals(ModelConstants.DATASET_RESULTTYPE_CLASSID));
+    }
 
     @NotNull
     protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
         return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new));
     }
 
-    protected <T extends Result> T read(String filename, Class<T> clazz ) throws IOException {
+    protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {
        final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
        return OBJECT_MAPPER.readValue(json, clazz);
     }
 
 }
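The test relies on an ObjectMapper with FAIL_ON_UNKNOWN_PROPERTIES disabled, so deserialization does not fail when the JSON fixtures carry fields the target classes do not declare. A small standalone sketch of that lenient setup (the bean and JSON payload are made up for illustration):

    import com.fasterxml.jackson.databind.DeserializationFeature;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class LenientMapperSketch {

        // Hypothetical bean used only for this sketch.
        public static class KeyValueBean {
            private String key;

            public String getKey() {
                return key;
            }

            public void setKey(String key) {
                this.key = key;
            }
        }

        public static void main(String[] args) throws Exception {
            // Same configuration as OBJECT_MAPPER in OafMapperUtilsTest:
            // unknown JSON fields are ignored instead of failing deserialization.
            ObjectMapper mapper = new ObjectMapper()
                .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

            // "unknownField" is not declared on KeyValueBean, but parsing still succeeds.
            String json = "{\"key\":\"some-id\",\"unknownField\":42}";
            KeyValueBean parsed = mapper.readValue(json, KeyValueBean.class);
            System.out.println(parsed.getKey()); // prints: some-id
        }
    }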
@@ -7,13 +7,13 @@ import java.text.SimpleDateFormat;
 import java.util.*;
 import java.util.stream.Collectors;
 
-import eu.dnetlib.dhp.schema.common.ModelConstants;
 import org.apache.commons.lang3.StringUtils;
 
 import com.google.common.collect.Sets;
 
 import eu.dnetlib.dhp.oa.dedup.DatePicker;
 import eu.dnetlib.dhp.schema.common.EntityType;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;