forked from D-Net/dnet-hadoop
[cleaning] title cleaning based on the me.xuender:unidecode library
This commit is contained in:
parent
2fff24df55
commit
6dddad86ee
|
@ -25,6 +25,11 @@
|
||||||
<groupId>com.github.sisyphsu</groupId>
|
<groupId>com.github.sisyphsu</groupId>
|
||||||
<artifactId>dateparser</artifactId>
|
<artifactId>dateparser</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>me.xuender</groupId>
|
||||||
|
<artifactId>unidecode</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_2.11</artifactId>
|
||||||
|
|
|
@ -7,22 +7,19 @@ import java.time.format.DateTimeFormatter;
|
||||||
import java.time.format.DateTimeParseException;
|
import java.time.format.DateTimeParseException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jetbrains.annotations.NotNull;
|
|
||||||
|
|
||||||
import com.github.sisyphsu.dateparser.DateParserUtils;
|
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import me.xuender.unidecode.Unidecode;
|
||||||
|
|
||||||
public class GraphCleaningFunctions extends CleaningFunctions {
|
public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
|
|
||||||
|
@ -194,11 +191,15 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||||
.filter(
|
.filter(
|
||||||
sp -> sp
|
sp -> {
|
||||||
.getValue()
|
final String title = sp
|
||||||
.toLowerCase()
|
.getValue()
|
||||||
.replaceAll(TITLE_FILTER_REGEX, "")
|
.toLowerCase();
|
||||||
.length() > TITLE_FILTER_RESIDUAL_LENGTH)
|
final String residual = Unidecode
|
||||||
|
.decode(title)
|
||||||
|
.replaceAll(TITLE_FILTER_REGEX, "");
|
||||||
|
return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
||||||
|
})
|
||||||
.map(GraphCleaningFunctions::cleanValue)
|
.map(GraphCleaningFunctions::cleanValue)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,12 +4,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.time.LocalDate;
|
|
||||||
import java.time.format.DateTimeFormatter;
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -19,13 +15,32 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import me.xuender.unidecode.Unidecode;
|
||||||
|
|
||||||
public class OafMapperUtilsTest {
|
public class OafMapperUtilsTest {
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testUnidecode() {
|
||||||
|
|
||||||
|
assertEquals("Liu Ben Mu hiruzuSen tawa", Unidecode.decode("六本木ヒルズ森タワ"));
|
||||||
|
assertEquals("Nan Wu A Mi Tuo Fo", Unidecode.decode("南无阿弥陀佛"));
|
||||||
|
assertEquals("Yi Tiao Hui Zou Lu De Yu", Unidecode.decode("一条会走路的鱼"));
|
||||||
|
assertEquals("amidaniyorai", Unidecode.decode("あみだにょらい"));
|
||||||
|
assertEquals("T`owrk`iayi", Unidecode.decode("Թուրքիայի"));
|
||||||
|
assertEquals("Obzor tematiki", Unidecode.decode("Обзор тематики"));
|
||||||
|
assertEquals("GERMANSKIE IaZYKI", Unidecode.decode("ГЕРМАНСКИЕ ЯЗЫКИ"));
|
||||||
|
assertEquals("Diereunese tes ikanopoieses", Unidecode.decode("Διερεύνηση της ικανοποίησης"));
|
||||||
|
assertEquals("lqDy l'wly@", Unidecode.decode("القضايا الأولية"));
|
||||||
|
assertEquals("abc def ghi", Unidecode.decode("abc def ghi"));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testDateValidation() {
|
public void testDateValidation() {
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.cleanup;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions.fixVocabularyNames;
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
import static org.mockito.Mockito.lenient;
|
import static org.mockito.Mockito.lenient;
|
||||||
|
|
||||||
|
@ -640,6 +642,30 @@ public class MappersTest {
|
||||||
System.out.println(p.getTitle().get(0).getValue());
|
System.out.println(p.getTitle().get(0).getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testJairo() throws IOException {
|
||||||
|
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_jairo.xml"));
|
||||||
|
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||||
|
|
||||||
|
System.out.println("***************");
|
||||||
|
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||||
|
System.out.println("***************");
|
||||||
|
|
||||||
|
final Publication p = (Publication) list.get(0);
|
||||||
|
assertValidId(p.getId());
|
||||||
|
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||||
|
|
||||||
|
assertNotNull(p.getTitle());
|
||||||
|
assertFalse(p.getTitle().isEmpty());
|
||||||
|
assertTrue(p.getTitle().size() == 1);
|
||||||
|
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||||
|
|
||||||
|
final Publication p_cleaned = cleanup(fixVocabularyNames(p));
|
||||||
|
|
||||||
|
assertNotNull(p_cleaned.getTitle());
|
||||||
|
assertFalse(p_cleaned.getTitle().isEmpty());
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testOdfFromHdfs() throws IOException {
|
void testOdfFromHdfs() throws IOException {
|
||||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml"));
|
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_from_hdfs.xml"));
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||||
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
|
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||||
|
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
<header xmlns="http://namespace.openaire.eu/">
|
||||||
|
<dri:objIdentifier>jairo_______::000012e58ed836576ef2a0d38b0f726f</dri:objIdentifier>
|
||||||
|
<dri:recordIdentifier>oai:irdb.nii.ac.jp:01221:0000010198</dri:recordIdentifier>
|
||||||
|
<dri:dateOfCollection/>
|
||||||
|
<dri:mdFormat/>
|
||||||
|
<dri:mdFormatInterpretation/>
|
||||||
|
<dri:repositoryId/>
|
||||||
|
<dr:objectIdentifier/>
|
||||||
|
<dr:dateOfCollection>2021-05-10T11:31:09.424Z</dr:dateOfCollection>
|
||||||
|
<dr:dateOfTransformation>2021-06-03T01:45:42.536Z</dr:dateOfTransformation>
|
||||||
|
<oaf:datasourceprefix>jairo_______</oaf:datasourceprefix>
|
||||||
|
</header>
|
||||||
|
<metadata xmlns="http://namespace.openaire.eu/">
|
||||||
|
<dc:title>多項式GCDを用いた復号法に関する研究</dc:title>
|
||||||
|
<dc:creator>上原, 剛</dc:creator>
|
||||||
|
<dc:creator>甲斐, 博</dc:creator>
|
||||||
|
<dc:creator>野田, 松太郎</dc:creator>
|
||||||
|
<dc:format>application/pdf</dc:format>
|
||||||
|
<dc:identifier>http://hdl.handle.net/2433/25934</dc:identifier>
|
||||||
|
<dc:language>jpn</dc:language>
|
||||||
|
<dc:publisher>京都大学数理解析研究所</dc:publisher>
|
||||||
|
<dc:subject classid="ndc" classname="ndc"
|
||||||
|
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">410</dc:subject>
|
||||||
|
<dc:type>Departmental Bulletin Paper</dc:type>
|
||||||
|
<dr:CobjCategory type="publication">0014</dr:CobjCategory>
|
||||||
|
<oaf:dateAccepted>2004-10-01</oaf:dateAccepted>
|
||||||
|
<oaf:projectid/>
|
||||||
|
<oaf:collectedDatasourceid>openaire____::554c7c2873</oaf:collectedDatasourceid>
|
||||||
|
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||||
|
<oaf:hostedBy id="openaire____::554c7c2873" name="JAIRO"/>
|
||||||
|
<oaf:collectedFrom id="openaire____::554c7c2873" name="JAIRO"/>
|
||||||
|
<oaf:identifier identifierType="handle">2433/25934</oaf:identifier>
|
||||||
|
<oaf:identifier identifierType="ncid">AN00061013</oaf:identifier>
|
||||||
|
<oaf:identifier identifierType="LandingPage">http://hdl.handle.net/2433/25934</oaf:identifier>
|
||||||
|
<oaf:fulltext>http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf</oaf:fulltext>
|
||||||
|
<oaf:journal ep="110" iss="" issn="1880-2818" sp="104" vol="1395">数理解析研究所講究録</oaf:journal>
|
||||||
|
</metadata>
|
||||||
|
<about>
|
||||||
|
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||||
|
<originDescription altered="true" harvestDate="2021-05-10T11:31:09.424Z">
|
||||||
|
<baseURL>https%3A%2F%2Firdb.nii.ac.jp%2Foai</baseURL>
|
||||||
|
<identifier>oai:irdb.nii.ac.jp:01221:0000010198</identifier>
|
||||||
|
<datestamp>2021-04-13T13:36:29Z</datestamp>
|
||||||
|
<metadataNamespace/>
|
||||||
|
<originDescription altered="true" harvestDate="2021-04-13T13:36:29Z">
|
||||||
|
<baseURL>http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request</baseURL>
|
||||||
|
<identifier>oai:repository.kulib.kyoto-u.ac.jp:2433/25934</identifier>
|
||||||
|
<datestamp>2012-07-12T14:15:41Z</datestamp>
|
||||||
|
<metadataNamespace>http://irdb.nii.ac.jp/oai</metadataNamespace>
|
||||||
|
</originDescription>
|
||||||
|
</originDescription>
|
||||||
|
</provenance>
|
||||||
|
<oaf:datainfo>
|
||||||
|
<oaf:inferred>false</oaf:inferred>
|
||||||
|
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||||
|
<oaf:trust>0.9</oaf:trust>
|
||||||
|
<oaf:inferenceprovenance/>
|
||||||
|
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
|
||||||
|
classname="sysimport:crosswalk:repository"
|
||||||
|
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||||
|
</oaf:datainfo>
|
||||||
|
</about>
|
||||||
|
</record>
|
5
pom.xml
5
pom.xml
|
@ -205,6 +205,11 @@
|
||||||
<artifactId>dateparser</artifactId>
|
<artifactId>dateparser</artifactId>
|
||||||
<version>1.0.7</version>
|
<version>1.0.7</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>me.xuender</groupId>
|
||||||
|
<artifactId>unidecode</artifactId>
|
||||||
|
<version>0.0.7</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.guava</groupId>
|
<groupId>com.google.guava</groupId>
|
||||||
|
|
Loading…
Reference in New Issue