Merge branch 'stable_ids' of code-repo.d4science.org:D-Net/dnet-hadoop into stable_ids

This commit is contained in:
Sandro La Bruzzo 2021-03-17 12:13:03 +01:00
commit 7c97a4d900
8 changed files with 242 additions and 42 deletions

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import static com.google.common.base.Preconditions.checkArgument;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.io.Serializable; import java.io.Serializable;
@ -54,6 +55,10 @@ public class IdentifierFactory implements Serializable {
PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive"); PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive");
} }
/**
 * Selects, among the given PIDs, the ones that {@code pidFromInstance} retains for the given datasource,
 * and returns them as a list.
 *
 * @param pid the candidate PIDs; a null list yields an empty result
 * @param collectedFrom the datasource the PIDs were collected from
 * @return the retained PIDs, in encounter order
 */
public static List<StructuredProperty> getPids(List<StructuredProperty> pid, KeyValue collectedFrom) {
	final Stream<StructuredProperty> retained = pidFromInstance(pid, collectedFrom);
	return retained.collect(Collectors.toList());
}
/** /**
* Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given * Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given
* entity T. Returns entity.id when none of the PIDs meet the selection criteria is available. * entity T. Returns entity.id when none of the PIDs meet the selection criteria is available.
@ -65,6 +70,8 @@ public class IdentifierFactory implements Serializable {
*/ */
public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) { public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) {
checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier");
final Map<String, List<StructuredProperty>> pids = extractPids(entity); final Map<String, List<StructuredProperty>> pids = extractPids(entity);
return pids return pids
@ -91,37 +98,7 @@ public class IdentifierFactory implements Serializable {
return Optional return Optional
.ofNullable(((Result) entity).getInstance()) .ofNullable(((Result) entity).getInstance())
.map( .map(
instance -> instance instance -> mapPids(instance))
.stream()
.map(
i -> Optional
.ofNullable(i.getPid())
.map(
pp -> pp
.stream()
// filter away PIDs provided by a DS that is not considered an authority for the
// given PID Type
.filter(p -> {
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
return Optional.ofNullable(i.getCollectedfrom()).isPresent() &&
Optional
.ofNullable(PID_AUTHORITY.get(pType))
.map(authorities -> {
final KeyValue cf = i.getCollectedfrom();
return authorities.containsKey(cf.getKey())
|| authorities.containsValue(cf.getValue());
})
.orElse(false);
})
.map(CleaningFunctions::normalizePidValue)
.filter(IdentifierFactory::pidFilter))
.orElse(Stream.empty()))
.flatMap(Function.identity())
.collect(
Collectors
.groupingBy(
p -> p.getQualifier().getClassid(),
Collectors.mapping(p -> p, Collectors.toList()))))
.orElse(new HashMap<>()); .orElse(new HashMap<>());
} else { } else {
return entity return entity
@ -137,6 +114,42 @@ public class IdentifierFactory implements Serializable {
} }
} }
/**
 * Collects the PIDs retained by {@code pidFromInstance} across all the given instances and groups them
 * by the classid of their qualifier.
 *
 * @param instance the instances to scan; must not be null
 * @return a map from PID type classid to the PIDs of that type
 */
private static Map<String, List<StructuredProperty>> mapPids(List<Instance> instance) {
	final Stream<StructuredProperty> retained = instance
		.stream()
		.flatMap(i -> pidFromInstance(i.getPid(), i.getCollectedfrom()));
	// groupingBy(classifier) collects each group into a List, as the explicit mapping(identity, toList()) did
	return retained.collect(Collectors.groupingBy(p -> p.getQualifier().getClassid()));
}
/**
 * Streams the PIDs that were provided by a datasource registered in {@code PID_AUTHORITY} for the PID's
 * own type. Retained PIDs are normalized with {@code CleaningFunctions::normalizePidValue} and then
 * checked with {@code IdentifierFactory::pidFilter}.
 *
 * @param pid the candidate PIDs; may be null
 * @param collectedFrom the datasource the PIDs were collected from; may be null
 * @return the retained PIDs, or an empty stream when either argument is null
 */
private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom) {
	// without a list of PIDs or a providing datasource nothing can be retained: decide once, up front,
	// instead of re-checking the datasource for every element
	if (pid == null || collectedFrom == null) {
		return Stream.empty();
	}
	return pid
		.stream()
		// filter away PIDs provided by a DS that is not considered an authority for the
		// given PID Type
		.filter(p -> {
			final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
			return Optional
				.ofNullable(PID_AUTHORITY.get(pType))
				.map(
					authorities -> authorities.containsKey(collectedFrom.getKey())
						|| authorities.containsValue(collectedFrom.getValue()))
				.orElse(false);
		})
		.map(CleaningFunctions::normalizePidValue)
		.filter(IdentifierFactory::pidFilter);
}
/** /**
* @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)} * @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)}
*/ */

View File

@ -23,6 +23,8 @@ public class Instance implements Serializable {
private List<StructuredProperty> pid; private List<StructuredProperty> pid;
private List<StructuredProperty> alternateIdentifier;
private Field<String> dateofacceptance; private Field<String> dateofacceptance;
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed // ( article | book ) processing charges. Defined here to cope with possible wrongly typed
@ -107,6 +109,14 @@ public class Instance implements Serializable {
this.dateofacceptance = dateofacceptance; this.dateofacceptance = dateofacceptance;
} }
/**
 * Returns the alternate identifiers of this instance.
 * NOTE(review): presumably the identifiers that were not retained as {@code pid} - confirm against the mappers.
 *
 * @return the stored list, possibly null when never set
 */
public List<StructuredProperty> getAlternateIdentifier() {
return alternateIdentifier;
}
/**
 * Sets the alternate identifiers of this instance. The given list is stored as is, without copying.
 *
 * @param alternateIdentifier the identifiers to store
 */
public void setAlternateIdentifier(List<StructuredProperty> alternateIdentifier) {
this.alternateIdentifier = alternateIdentifier;
}
public Field<String> getProcessingchargeamount() { public Field<String> getProcessingchargeamount() {
return processingchargeamount; return processingchargeamount;
} }
@ -159,4 +169,5 @@ public class Instance implements Serializable {
return toComparableString().equals(other.toComparableString()); return toComparableString().equals(other.toComparableString());
} }
} }

View File

@ -2,6 +2,13 @@
package eu.dnetlib.dhp.schema.oaf; package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable; import java.io.Serializable;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Joiner;
public class StructuredProperty implements Serializable { public class StructuredProperty implements Serializable {
@ -36,7 +43,12 @@ public class StructuredProperty implements Serializable {
} }
public String toComparableString() { public String toComparableString() {
return value != null ? value.toLowerCase() : ""; return Stream
.of(
getQualifier().toComparableString(),
Optional.ofNullable(getValue()).map(String::toLowerCase).orElse(""))
.filter(StringUtils::isNotBlank)
.collect(Collectors.joining("||"));
} }
@Override @Override

View File

@ -284,8 +284,8 @@ public abstract class AbstractMdRecordToOafMapper {
r.setCollectedfrom(Arrays.asList(collectedFrom)); r.setCollectedfrom(Arrays.asList(collectedFrom));
r.setPid(prepareResultPids(doc, info)); r.setPid(prepareResultPids(doc, info));
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection|//dri:dateOfCollection")); r.setDateofcollection(doc.valueOf("//dr:dateOfCollection/text()|//dri:dateOfCollection/text()"));
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation|//dri:dateOfTransformation")); r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation/text()|//dri:dateOfTransformation/text()"));
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setOaiprovenance(prepareOAIprovenance(doc)); r.setOaiprovenance(prepareOAIprovenance(doc));
r.setAuthor(prepareAuthors(doc, info)); r.setAuthor(prepareAuthors(doc, info));

View File

@ -5,7 +5,9 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -19,6 +21,7 @@ import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class OafToOafMapper extends AbstractMdRecordToOafMapper { public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@ -125,7 +128,17 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE)); .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
instance.setCollectedfrom(collectedfrom); instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby); instance.setHostedby(hostedby);
instance.setPid(prepareResultPids(doc, info));
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
instance
.setAlternateIdentifier(
alternateIdentifier.stream().filter(i -> !pids.contains(i)).collect(Collectors.toList()));
instance.setPid(pid);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance instance

View File

@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
import java.util.*; import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -14,6 +15,7 @@ import org.dom4j.Node;
import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class OdfToOafMapper extends AbstractMdRecordToOafMapper { public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@ -102,7 +104,17 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE)); .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
instance.setCollectedfrom(collectedfrom); instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby); instance.setHostedby(hostedby);
instance.setPid(prepareResultPids(doc, info));
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
instance
.setAlternateIdentifier(
alternateIdentifier.stream().filter(i -> !pids.contains(i)).collect(Collectors.toList()));
instance.setPid(pid);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
instance instance

View File

@ -123,9 +123,11 @@ public class MappersTest {
}); });
assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid()); assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid());
assertNotNull(p.getInstance().get(0).getPid()); assertNotNull(p.getInstance().get(0).getPid());
assertTrue(p.getInstance().get(0).getPid().size() == 1); assertTrue(p.getInstance().get(0).getPid().isEmpty());
assertEquals("doi", p.getInstance().get(0).getPid().get(0).getQualifier().getClassid());
assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getPid().get(0).getValue()); assertTrue(!p.getInstance().get(0).getAlternateIdentifier().isEmpty());
assertEquals("doi", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid());
assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue());
assertNotNull(p.getBestaccessright()); assertNotNull(p.getBestaccessright());
assertEquals("OPEN", p.getBestaccessright().getClassid()); assertEquals("OPEN", p.getBestaccessright().getClassid());
@ -154,6 +156,78 @@ public class MappersTest {
// System.out.println(new ObjectMapper().writeValueAsString(r2)); // System.out.println(new ObjectMapper().writeValueAsString(r2));
} }
/**
 * Maps the PubMed Central fixture record and checks the resulting publication: ids, authors and their
 * ORCID, result-level PIDs, and the split of the instance identifiers between pid and alternateIdentifier.
 */
@Test
void testPublication_PubMed() throws IOException {

	final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record_pubmed.xml"));

	final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);

	assertEquals(1, list.size());
	assertTrue(list.get(0) instanceof Publication);

	final Publication p = (Publication) list.get(0);

	assertValidId(p.getId());
	assertEquals(2, p.getOriginalId().size());
	assertTrue(p.getOriginalId().contains("oai:pubmedcentral.nih.gov:1517292"));
	assertValidId(p.getCollectedfrom().get(0).getKey());
	assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
	assertFalse(p.getDataInfo().getInvisible());
	assertTrue(StringUtils.isNotBlank(p.getDateofcollection()));
	assertTrue(StringUtils.isNotBlank(p.getDateoftransformation()));

	assertFalse(p.getAuthor().isEmpty());

	// first author carrying at least one PID
	final Optional<Author> author = p
		.getAuthor()
		.stream()
		.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
		.findFirst();
	assertTrue(author.isPresent());

	final StructuredProperty pid = author
		.get()
		.getPid()
		.stream()
		.findFirst()
		.get();
	assertEquals("0000-0001-6651-1178", pid.getValue());
	assertEquals("ORCID", pid.getQualifier().getClassid());
	assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname());
	assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
	assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
	assertEquals("Votsi,Nefta", author.get().getFullname());
	assertEquals("Votsi", author.get().getSurname());
	assertEquals("Nefta", author.get().getName());

	assertFalse(p.getSubject().isEmpty());
	assertFalse(p.getPid().isEmpty());
	// expected value first, so that a failure reports expected/actual the right way round
	assertEquals("PMC1517292", p.getPid().get(0).getValue());
	assertEquals("pmc", p.getPid().get(0).getQualifier().getClassid());

	assertNotNull(p.getInstance());
	assertFalse(p.getInstance().isEmpty());
	p
		.getInstance()
		.forEach(i -> {
			assertNotNull(i.getAccessright());
			assertEquals("OPEN", i.getAccessright().getClassid());
		});

	final Instance first = p.getInstance().get(0);
	assertEquals("UNKNOWN", first.getRefereed().getClassid());
	assertNotNull(first.getPid());
	// assertEquals (rather than assertTrue on ==) reports the actual size on failure
	assertEquals(2, first.getPid().size());

	assertEquals(1, first.getAlternateIdentifier().size());
	assertEquals("doi", first.getAlternateIdentifier().get(0).getQualifier().getClassid());
	assertEquals("10.3897/oneeco.2.e13718", first.getAlternateIdentifier().get(0).getValue());

	assertNotNull(p.getBestaccessright());
	assertEquals("OPEN", p.getBestaccessright().getClassid());
}
@Test @Test
void testPublicationInvisible() throws IOException { void testPublicationInvisible() throws IOException {
@ -239,9 +313,10 @@ public class MappersTest {
}); });
assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid()); assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid());
assertNotNull(d.getInstance().get(0).getPid()); assertNotNull(d.getInstance().get(0).getPid());
assertTrue(d.getInstance().get(0).getPid().size() == 1); assertTrue(d.getInstance().get(0).getPid().isEmpty());
assertEquals("doi", d.getInstance().get(0).getPid().get(0).getQualifier().getClassid());
assertEquals("10.5281/zenodo.3234526", d.getInstance().get(0).getPid().get(0).getValue()); assertEquals("doi", d.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid());
assertEquals("10.5281/zenodo.3234526", d.getInstance().get(0).getAlternateIdentifier().get(0).getValue());
assertValidId(r1.getSource()); assertValidId(r1.getSource());
assertValidId(r1.getTarget()); assertValidId(r1.getTarget());

View File

@ -0,0 +1,64 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header xmlns="http://namespace.openaire.eu/">
<dri:objIdentifier>od_______267::0000072375bc0e68fa09d4e6b7658248</dri:objIdentifier>
<dri:recordIdentifier>oai:pubmedcentral.nih.gov:1517292</dri:recordIdentifier>
<dri:dateOfCollection/>
<dri:mdFormat/>
<dri:mdFormatInterpretation/>
<dri:repositoryId/>
<dr:objectIdentifier/>
<dr:dateOfCollection>2020-08-03T18:38:58Z</dr:dateOfCollection>
<dr:dateOfTransformation>2020-08-03T19:38:58Z</dr:dateOfTransformation>
<oaf:datasourceprefix>od_______267</oaf:datasourceprefix>
</header>
<metadata xmlns="http://namespace.openaire.eu/">
<dc:title>DEATHS</dc:title>
<dc:creator>Nikolaidou,Charitini</dc:creator>
<dc:creator nameIdentifier="0000-0001-6651-1178" nameIdentifierScheme="ORCID">Votsi,Nefta</dc:creator>
<dc:creator>Sgardelis,Steanos</dc:creator>
<dc:creator>Halley,John</dc:creator>
<dc:creator>Pantis,John</dc:creator>
<dc:creator>Tsiafouli,Maria</dc:creator>
<dc:date>1922-07</dc:date>
<dc:description/>
<dc:identifier>https://europepmc.org/articles/PMC1517292/</dc:identifier>
<dc:language>eng</dc:language>
<dc:subject>Articles</dc:subject>
<dc:type>Text</dc:type>
<dr:CobjCategory type="publication">0038</dr:CobjCategory>
<dr:CobjIdentifier/>
<oaf:dateAccepted>1922-07-01</oaf:dateAccepted>
<oaf:collectedDatasourceid>opendoar____::267</oaf:collectedDatasourceid>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:hostedBy id="opendoar____::908" name="Europe PubMed Central"/>
<oaf:collectedFrom id="opendoar____::267" name="PubMed Central"/>
<oaf:identifier identifierType="pmc">PMC1517292</oaf:identifier>
<oaf:identifier identifierType="pmid">18738762</oaf:identifier>
<oaf:identifier identifierType="doi">10.3897/oneeco.2.e13718</oaf:identifier>
</metadata>
<about>
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2018-07-13T13:07:32.339Z">
<baseURL>https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi</baseURL>
<identifier>oai:pubmedcentral.nih.gov:1517292</identifier>
<datestamp>2006-08-14</datestamp>
<metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
classname="sysimport:crosswalk:repository"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>