merging with branch beta

This commit is contained in:
Miriam Baglioni 2024-08-12 18:03:10 +02:00
commit 45605f93ae
7 changed files with 48 additions and 12 deletions

View File

@ -129,7 +129,8 @@ public class PrepareAffiliationRelations implements Serializable {
Dataset<Row> df = spark Dataset<Row> df = spark
.read() .read()
.schema("`DOI` STRING, `Matchings` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>") .schema("`DOI` STRING, `Matchings` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
.json(inputPath); .json(inputPath)
.where("DOI is not null");
// unroll nested arrays // unroll nested arrays
df = df df = df

View File

@ -31,5 +31,11 @@ class ORCIDAuthorMatchersTest {
assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin")) assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin"))
// assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented
} }
@Test def testDocumentationNames(): Unit = {
  // Checks that the ordered-token matcher treats a name carrying abbreviated
  // middle initials and the spelled-out, hyphenated form as the same author.
  val abbreviatedName = "James C. A. Miller-Jones"
  val spelledOutName = "James Antony Miller-Jones"
  val sameAuthor = matchOrderedTokenAndAbbreviations(abbreviatedName, spelledOutName)
  assertTrue(sameAuthor)
}
@Test def testDocumentationNames2(): Unit = {
  // Same check as the hyphenated case, but the surname in the second name is
  // written with a space ("Miller Jones") instead of a hyphen.
  val abbreviatedName = "James C. A. Miller-Jones"
  val spelledOutName = "James Antony Miller Jones"
  val sameAuthor = matchOrderedTokenAndAbbreviations(abbreviatedName, spelledOutName)
  assertTrue(sameAuthor)
}
} }

View File

@ -698,6 +698,7 @@ public class ProvisionModelSupport {
.stream() .stream()
.filter(s -> Objects.nonNull(s.getQualifier())) .filter(s -> Objects.nonNull(s.getQualifier()))
.filter(s -> Objects.nonNull(s.getQualifier().getClassname())) .filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
.filter(ProvisionModelSupport::filterFosL1L2)
.map( .map(
s -> Subject s -> Subject
.newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname())) .newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname()))
@ -720,6 +721,16 @@ public class ProvisionModelSupport {
.orElse(null); .orElse(null);
} }
/**
 * Accepts a leading subject code made of exactly two or exactly four digits.
 * Compiled once so the predicate does not recompile the regex for every subject.
 */
private static final java.util.regex.Pattern FOS_L1_L2_CODE = java.util.regex.Pattern
    .compile("^\\d{2}$|^\\d{4}$");

/**
 * Stream predicate deciding whether a subject is kept.
 *
 * <p>Subjects whose qualifier classid is not {@code ModelConstants.DNET_SUBJECT_FOS_CLASSID}
 * (including subjects without a qualifier) are always kept. For FOS subjects, the text before
 * the first space of the value is taken as the code, and the subject is kept only when that
 * code consists of exactly two or exactly four digits (presumably the level 1 and level 2
 * FOS codes, going by the method name).
 *
 * <p>A FOS subject whose value is {@code null} is rejected rather than raising a
 * {@link NullPointerException}: {@code StringUtils.substringBefore} returns {@code null}
 * for a {@code null} input.
 *
 * @param s the subject to test; must not be {@code null}
 * @return {@code true} when the subject should be kept, {@code false} when it is a FOS
 *         subject whose code is not a two- or four-digit code
 */
public static boolean filterFosL1L2(StructuredProperty s) {
    final String subjectType = Optional.ofNullable(s.getQualifier()).map(Qualifier::getClassid).orElse("");
    if (!ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(subjectType)) {
        // Only FOS subjects are restricted; everything else passes through untouched.
        return true;
    }
    // substringBefore yields null for a null value and the whole string when there is no space.
    final String code = StringUtils.substringBefore(s.getValue(), " ");
    return code != null && FOS_L1_L2_CODE.matcher(code).matches();
}
private static Country asCountry(eu.dnetlib.dhp.schema.oaf.Qualifier country) { private static Country asCountry(eu.dnetlib.dhp.schema.oaf.Qualifier country) {
return Optional return Optional
.ofNullable(country) .ofNullable(country)

View File

@ -20,6 +20,7 @@ import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource; import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamResult;
import eu.dnetlib.dhp.oa.provision.model.*;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Pair;
@ -41,10 +42,6 @@ import com.google.common.collect.Sets;
import com.mycila.xmltool.XMLDoc; import com.mycila.xmltool.XMLDoc;
import com.mycila.xmltool.XMLTag; import com.mycila.xmltool.XMLTag;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.oa.provision.model.XmlInstance;
import eu.dnetlib.dhp.schema.common.*; import eu.dnetlib.dhp.schema.common.*;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
@ -389,6 +386,7 @@ public class XmlRecordFactory implements Serializable {
.getSubject() .getSubject()
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(ProvisionModelSupport::filterFosL1L2)
.map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s)) .map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }

View File

@ -5,11 +5,7 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix;
import static org.apache.commons.lang3.StringUtils.isBlank; import static org.apache.commons.lang3.StringUtils.isBlank;
import static org.apache.commons.lang3.StringUtils.isNotBlank; import static org.apache.commons.lang3.StringUtils.isNotBlank;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;

View File

@ -97,6 +97,8 @@ public class XmlRecordFactoryTest {
assertEquals("bronze", doc.valueOf("//*[local-name() = 'result']/openaccesscolor/text()")); assertEquals("bronze", doc.valueOf("//*[local-name() = 'result']/openaccesscolor/text()"));
assertEquals("true", doc.valueOf("//*[local-name() = 'result']/isindiamondjournal/text()")); assertEquals("true", doc.valueOf("//*[local-name() = 'result']/isindiamondjournal/text()"));
assertEquals("true", doc.valueOf("//*[local-name() = 'result']/publiclyfunded/text()")); assertEquals("true", doc.valueOf("//*[local-name() = 'result']/publiclyfunded/text()"));
assertEquals(15, doc.selectNodes("//*[local-name() = 'result']/*[local-name() = 'subject']").size());
} }
@Test @Test

View File

@ -1886,12 +1886,34 @@
"trust": "" "trust": ""
}, },
"qualifier": { "qualifier": {
"classid": "keyword", "classid": "FOS",
"classname": "keyword", "classname": "Fields of Science and Technology classification",
"schemeid": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies" "schemename": "dnet:subject_classification_typologies"
}, },
"value": "Thermal conductivity" "value": "0101 mathematics"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"trust": ""
},
"qualifier": {
"classid": "FOS",
"classname": "Fields of Science and Technology classification",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"value": "010101 applied mathematics"
} }
], ],
"title": [ "title": [