forked from D-Net/dnet-hadoop
moved class CleaningFunctions in the correct package. Remove newlines from titles, descriptions, subjects
This commit is contained in:
parent
33bae02451
commit
e1a1bb3ee4
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
|
@ -10,12 +10,12 @@ import org.apache.commons.lang3.StringUtils;
|
|||
import com.clearspring.analytics.util.Lists;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class CleaningFunctions {
|
||||
|
||||
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
|
||||
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
|
||||
public static final String NEWLINES = "(?:\\n|\\r)";
|
||||
|
||||
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
||||
|
||||
|
@ -76,7 +76,7 @@ public class CleaningFunctions {
|
|||
return value;
|
||||
}
|
||||
|
||||
protected static <T extends Oaf> T fixDefaults(T value) {
|
||||
public static <T extends Oaf> T cleanup(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
|
@ -109,6 +109,29 @@ public class CleaningFunctions {
|
|||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(CleaningFunctions::removeNewLines)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getTitle())) {
|
||||
r
|
||||
.setTitle(
|
||||
r
|
||||
.getTitle()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.map(CleaningFunctions::removeNewLines)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getDescription())) {
|
||||
r
|
||||
.setDescription(
|
||||
r
|
||||
.getDescription()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.map(CleaningFunctions::removeNewLines)
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getPid())) {
|
||||
|
@ -205,6 +228,16 @@ public class CleaningFunctions {
|
|||
return value;
|
||||
}
|
||||
|
||||
protected static StructuredProperty removeNewLines(StructuredProperty s) {
|
||||
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
|
||||
return s;
|
||||
}
|
||||
|
||||
protected static Field<String> removeNewLines(Field<String> s) {
|
||||
s.setValue(s.getValue().replaceAll(NEWLINES, " "));
|
||||
return s;
|
||||
}
|
||||
|
||||
// HELPERS
|
||||
|
||||
private static void fixVocabName(Qualifier q, String vocabularyName) {
|
||||
|
|
|
@ -7,11 +7,10 @@ import java.util.Map;
|
|||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
|
|
@ -4,12 +4,7 @@ package eu.dnetlib.dhp.schema.oaf.utils;
|
|||
import java.util.Comparator;
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class PidValueComparator implements Comparator<StructuredProperty> {
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.schema.oaf.CleaningFunctions.*;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
|
@ -26,7 +27,6 @@ import eu.dnetlib.dhp.schema.oaf.Oaf;
|
|||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import static eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions.*;
|
||||
|
||||
public class CleanGraphSparkJob {
|
||||
|
||||
|
@ -89,7 +89,7 @@ public class CleanGraphSparkJob {
|
|||
readTableFromPath(spark, inputPath, clazz)
|
||||
.map((MapFunction<T, T>) value -> fixVocabularyNames(value), Encoders.bean(clazz))
|
||||
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
||||
.map((MapFunction<T, T>) value -> fixDefaults(value), Encoders.bean(clazz))
|
||||
.map((MapFunction<T, T>) value -> cleanup(value), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
|
|
|
@ -16,9 +16,9 @@ import org.dom4j.Node;
|
|||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.dhp.common.PacePerson;
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
||||
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
|
|
|
@ -12,7 +12,6 @@ import org.dom4j.Document;
|
|||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.common.PacePerson;
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
|
|
|
@ -19,10 +19,7 @@ import org.mockito.junit.jupiter.MockitoExtension;
|
|||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
|
@ -89,7 +86,7 @@ public class CleaningFunctionTest {
|
|||
.map(p -> p.getQualifier())
|
||||
.allMatch(q -> pidTerms.contains(q.getClassid())));
|
||||
|
||||
Publication p_defaults = CleaningFunctions.fixDefaults(p_out);
|
||||
Publication p_defaults = CleaningFunctions.cleanup(p_out);
|
||||
assertEquals("CLOSED", p_defaults.getBestaccessright().getClassid());
|
||||
assertNull(p_out.getPublisher());
|
||||
|
||||
|
|
Loading…
Reference in New Issue