forked from antonis.lempesis/dnet-hadoop
Merge branch 'stable_ids' of code-repo.d4science.org:D-Net/dnet-hadoop into stable_ids
This commit is contained in:
commit
7c97a4d900
|
@ -1,6 +1,7 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -54,6 +55,10 @@ public class IdentifierFactory implements Serializable {
|
|||
PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive");
|
||||
}
|
||||
|
||||
public static List<StructuredProperty> getPids(List<StructuredProperty> pid, KeyValue collectedFrom) {
|
||||
return pidFromInstance(pid, collectedFrom).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given
|
||||
* entity T. Returns entity.id when none of the available PIDs meets the selection criteria.
|
||||
|
@ -65,6 +70,8 @@ public class IdentifierFactory implements Serializable {
|
|||
*/
|
||||
public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) {
|
||||
|
||||
checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier");
|
||||
|
||||
final Map<String, List<StructuredProperty>> pids = extractPids(entity);
|
||||
|
||||
return pids
|
||||
|
@ -91,37 +98,7 @@ public class IdentifierFactory implements Serializable {
|
|||
return Optional
|
||||
.ofNullable(((Result) entity).getInstance())
|
||||
.map(
|
||||
instance -> instance
|
||||
.stream()
|
||||
.map(
|
||||
i -> Optional
|
||||
.ofNullable(i.getPid())
|
||||
.map(
|
||||
pp -> pp
|
||||
.stream()
|
||||
// filter away PIDs provided by a DS that is not considered an authority for the
|
||||
// given PID Type
|
||||
.filter(p -> {
|
||||
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
|
||||
return Optional.ofNullable(i.getCollectedfrom()).isPresent() &&
|
||||
Optional
|
||||
.ofNullable(PID_AUTHORITY.get(pType))
|
||||
.map(authorities -> {
|
||||
final KeyValue cf = i.getCollectedfrom();
|
||||
return authorities.containsKey(cf.getKey())
|
||||
|| authorities.containsValue(cf.getValue());
|
||||
})
|
||||
.orElse(false);
|
||||
})
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.filter(IdentifierFactory::pidFilter))
|
||||
.orElse(Stream.empty()))
|
||||
.flatMap(Function.identity())
|
||||
.collect(
|
||||
Collectors
|
||||
.groupingBy(
|
||||
p -> p.getQualifier().getClassid(),
|
||||
Collectors.mapping(p -> p, Collectors.toList()))))
|
||||
instance -> mapPids(instance))
|
||||
.orElse(new HashMap<>());
|
||||
} else {
|
||||
return entity
|
||||
|
@ -137,6 +114,42 @@ public class IdentifierFactory implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
private static Map<String, List<StructuredProperty>> mapPids(List<Instance> instance) {
|
||||
return instance
|
||||
.stream()
|
||||
.map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom()))
|
||||
.flatMap(Function.identity())
|
||||
.collect(
|
||||
Collectors
|
||||
.groupingBy(
|
||||
p -> p.getQualifier().getClassid(),
|
||||
Collectors.mapping(p -> p, Collectors.toList())));
|
||||
}
|
||||
|
||||
private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom) {
|
||||
return Optional
|
||||
.ofNullable(pid)
|
||||
.map(
|
||||
pp -> pp
|
||||
.stream()
|
||||
// filter away PIDs provided by a DS that is not considered an authority for the
|
||||
// given PID Type
|
||||
.filter(p -> {
|
||||
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
|
||||
return Optional.ofNullable(collectedFrom).isPresent() &&
|
||||
Optional
|
||||
.ofNullable(PID_AUTHORITY.get(pType))
|
||||
.map(authorities -> {
|
||||
return authorities.containsKey(collectedFrom.getKey())
|
||||
|| authorities.containsValue(collectedFrom.getValue());
|
||||
})
|
||||
.orElse(false);
|
||||
})
|
||||
.map(CleaningFunctions::normalizePidValue)
|
||||
.filter(IdentifierFactory::pidFilter))
|
||||
.orElse(Stream.empty());
|
||||
}
|
||||
|
||||
/**
|
||||
* @see IdentifierFactory#createIdentifier(OafEntity, boolean)
|
||||
*/
|
||||
|
|
|
@ -23,6 +23,8 @@ public class Instance implements Serializable {
|
|||
|
||||
private List<StructuredProperty> pid;
|
||||
|
||||
private List<StructuredProperty> alternateIdentifier;
|
||||
|
||||
private Field<String> dateofacceptance;
|
||||
|
||||
// ( article | book ) processing charges. Defined here to cope with possible wrongly typed
|
||||
|
@ -107,6 +109,14 @@ public class Instance implements Serializable {
|
|||
this.dateofacceptance = dateofacceptance;
|
||||
}
|
||||
|
||||
public List<StructuredProperty> getAlternateIdentifier() {
|
||||
return alternateIdentifier;
|
||||
}
|
||||
|
||||
public void setAlternateIdentifier(List<StructuredProperty> alternateIdentifier) {
|
||||
this.alternateIdentifier = alternateIdentifier;
|
||||
}
|
||||
|
||||
public Field<String> getProcessingchargeamount() {
|
||||
return processingchargeamount;
|
||||
}
|
||||
|
@ -159,4 +169,5 @@ public class Instance implements Serializable {
|
|||
|
||||
return toComparableString().equals(other.toComparableString());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2,6 +2,13 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
|
||||
public class StructuredProperty implements Serializable {
|
||||
|
||||
|
@ -36,7 +43,12 @@ public class StructuredProperty implements Serializable {
|
|||
}
|
||||
|
||||
public String toComparableString() {
|
||||
return value != null ? value.toLowerCase() : "";
|
||||
return Stream
|
||||
.of(
|
||||
getQualifier().toComparableString(),
|
||||
Optional.ofNullable(getValue()).map(String::toLowerCase).orElse(""))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.joining("||"));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -284,8 +284,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
|
||||
r.setCollectedfrom(Arrays.asList(collectedFrom));
|
||||
r.setPid(prepareResultPids(doc, info));
|
||||
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection|//dri:dateOfCollection"));
|
||||
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation|//dri:dateOfTransformation"));
|
||||
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection/text()|//dri:dateOfCollection/text()"));
|
||||
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation/text()|//dri:dateOfTransformation/text()"));
|
||||
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||
r.setOaiprovenance(prepareOAIprovenance(doc));
|
||||
r.setAuthor(prepareAuthors(doc, info));
|
||||
|
|
|
@ -5,7 +5,9 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
|||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -19,6 +21,7 @@ import eu.dnetlib.dhp.common.PacePerson;
|
|||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
|
@ -125,7 +128,17 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
|
||||
instance.setCollectedfrom(collectedfrom);
|
||||
instance.setHostedby(hostedby);
|
||||
instance.setPid(prepareResultPids(doc, info));
|
||||
|
||||
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
|
||||
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
|
||||
|
||||
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
instance
|
||||
.setAlternateIdentifier(
|
||||
alternateIdentifier.stream().filter(i -> !pids.contains(i)).collect(Collectors.toList()));
|
||||
instance.setPid(pid);
|
||||
|
||||
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
||||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||
instance
|
||||
|
|
|
@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
|||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -14,6 +15,7 @@ import org.dom4j.Node;
|
|||
import eu.dnetlib.dhp.common.PacePerson;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
|
||||
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
|
@ -102,7 +104,17 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
|
||||
instance.setCollectedfrom(collectedfrom);
|
||||
instance.setHostedby(hostedby);
|
||||
instance.setPid(prepareResultPids(doc, info));
|
||||
|
||||
final List<StructuredProperty> alternateIdentifier = prepareResultPids(doc, info);
|
||||
final List<StructuredProperty> pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom);
|
||||
|
||||
final Set<StructuredProperty> pids = pid.stream().collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
instance
|
||||
.setAlternateIdentifier(
|
||||
alternateIdentifier.stream().filter(i -> !pids.contains(i)).collect(Collectors.toList()));
|
||||
instance.setPid(pid);
|
||||
|
||||
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
||||
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
||||
instance
|
||||
|
|
|
@ -123,9 +123,11 @@ public class MappersTest {
|
|||
});
|
||||
assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid());
|
||||
assertNotNull(p.getInstance().get(0).getPid());
|
||||
assertTrue(p.getInstance().get(0).getPid().size() == 1);
|
||||
assertEquals("doi", p.getInstance().get(0).getPid().get(0).getQualifier().getClassid());
|
||||
assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getPid().get(0).getValue());
|
||||
assertTrue(p.getInstance().get(0).getPid().isEmpty());
|
||||
|
||||
assertTrue(!p.getInstance().get(0).getAlternateIdentifier().isEmpty());
|
||||
assertEquals("doi", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid());
|
||||
assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue());
|
||||
|
||||
assertNotNull(p.getBestaccessright());
|
||||
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
||||
|
@ -154,6 +156,78 @@ public class MappersTest {
|
|||
// System.out.println(new ObjectMapper().writeValueAsString(r2));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPublication_PubMed() throws IOException {
|
||||
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record_pubmed.xml"));
|
||||
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
|
||||
assertEquals(1, list.size());
|
||||
assertTrue(list.get(0) instanceof Publication);
|
||||
|
||||
final Publication p = (Publication) list.get(0);
|
||||
|
||||
assertValidId(p.getId());
|
||||
|
||||
assertEquals(2, p.getOriginalId().size());
|
||||
assertTrue(p.getOriginalId().contains("oai:pubmedcentral.nih.gov:1517292"));
|
||||
|
||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||
assertFalse(p.getDataInfo().getInvisible());
|
||||
assertTrue(StringUtils.isNotBlank(p.getDateofcollection()));
|
||||
assertTrue(StringUtils.isNotBlank(p.getDateoftransformation()));
|
||||
|
||||
assertTrue(p.getAuthor().size() > 0);
|
||||
final Optional<Author> author = p
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
|
||||
.findFirst();
|
||||
assertTrue(author.isPresent());
|
||||
|
||||
final StructuredProperty pid = author
|
||||
.get()
|
||||
.getPid()
|
||||
.stream()
|
||||
.findFirst()
|
||||
.get();
|
||||
assertEquals("0000-0001-6651-1178", pid.getValue());
|
||||
assertEquals("ORCID", pid.getQualifier().getClassid());
|
||||
assertEquals("Open Researcher and Contributor ID", pid.getQualifier().getClassname());
|
||||
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemeid());
|
||||
assertEquals(ModelConstants.DNET_PID_TYPES, pid.getQualifier().getSchemename());
|
||||
assertEquals("Votsi,Nefta", author.get().getFullname());
|
||||
assertEquals("Votsi", author.get().getSurname());
|
||||
assertEquals("Nefta", author.get().getName());
|
||||
|
||||
assertTrue(p.getSubject().size() > 0);
|
||||
assertTrue(p.getPid().size() > 0);
|
||||
assertEquals(p.getPid().get(0).getValue(), "PMC1517292");
|
||||
assertEquals(p.getPid().get(0).getQualifier().getClassid(), "pmc");
|
||||
|
||||
assertNotNull(p.getInstance());
|
||||
assertTrue(p.getInstance().size() > 0);
|
||||
p
|
||||
.getInstance()
|
||||
.stream()
|
||||
.forEach(i -> {
|
||||
assertNotNull(i.getAccessright());
|
||||
assertEquals("OPEN", i.getAccessright().getClassid());
|
||||
});
|
||||
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
|
||||
assertNotNull(p.getInstance().get(0).getPid());
|
||||
assertTrue(p.getInstance().get(0).getPid().size() == 2);
|
||||
|
||||
assertTrue(p.getInstance().get(0).getAlternateIdentifier().size() == 1);
|
||||
assertEquals("doi", p.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid());
|
||||
assertEquals("10.3897/oneeco.2.e13718", p.getInstance().get(0).getAlternateIdentifier().get(0).getValue());
|
||||
|
||||
assertNotNull(p.getBestaccessright());
|
||||
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPublicationInvisible() throws IOException {
|
||||
|
||||
|
@ -239,9 +313,10 @@ public class MappersTest {
|
|||
});
|
||||
assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid());
|
||||
assertNotNull(d.getInstance().get(0).getPid());
|
||||
assertTrue(d.getInstance().get(0).getPid().size() == 1);
|
||||
assertEquals("doi", d.getInstance().get(0).getPid().get(0).getQualifier().getClassid());
|
||||
assertEquals("10.5281/zenodo.3234526", d.getInstance().get(0).getPid().get(0).getValue());
|
||||
assertTrue(d.getInstance().get(0).getPid().isEmpty());
|
||||
|
||||
assertEquals("doi", d.getInstance().get(0).getAlternateIdentifier().get(0).getQualifier().getClassid());
|
||||
assertEquals("10.5281/zenodo.3234526", d.getInstance().get(0).getAlternateIdentifier().get(0).getValue());
|
||||
|
||||
assertValidId(r1.getSource());
|
||||
assertValidId(r1.getTarget());
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header xmlns="http://namespace.openaire.eu/">
|
||||
<dri:objIdentifier>od_______267::0000072375bc0e68fa09d4e6b7658248</dri:objIdentifier>
|
||||
<dri:recordIdentifier>oai:pubmedcentral.nih.gov:1517292</dri:recordIdentifier>
|
||||
<dri:dateOfCollection/>
|
||||
<dri:mdFormat/>
|
||||
<dri:mdFormatInterpretation/>
|
||||
<dri:repositoryId/>
|
||||
<dr:objectIdentifier/>
|
||||
<dr:dateOfCollection>2020-08-03T18:38:58Z</dr:dateOfCollection>
|
||||
<dr:dateOfTransformation>2020-08-03T19:38:58Z</dr:dateOfTransformation>
|
||||
<oaf:datasourceprefix>od_______267</oaf:datasourceprefix>
|
||||
</header>
|
||||
<metadata xmlns="http://namespace.openaire.eu/">
|
||||
<dc:title>DEATHS</dc:title>
|
||||
<dc:creator>Nikolaidou,Charitini</dc:creator>
|
||||
<dc:creator nameIdentifier="0000-0001-6651-1178" nameIdentifierScheme="ORCID">Votsi,Nefta</dc:creator>
|
||||
<dc:creator>Sgardelis,Steanos</dc:creator>
|
||||
<dc:creator>Halley,John</dc:creator>
|
||||
<dc:creator>Pantis,John</dc:creator>
|
||||
<dc:creator>Tsiafouli,Maria</dc:creator>
|
||||
<dc:date>1922-07</dc:date>
|
||||
<dc:description/>
|
||||
<dc:identifier>https://europepmc.org/articles/PMC1517292/</dc:identifier>
|
||||
<dc:language>eng</dc:language>
|
||||
<dc:subject>Articles</dc:subject>
|
||||
<dc:type>Text</dc:type>
|
||||
<dr:CobjCategory type="publication">0038</dr:CobjCategory>
|
||||
<dr:CobjIdentifier/>
|
||||
<oaf:dateAccepted>1922-07-01</oaf:dateAccepted>
|
||||
<oaf:collectedDatasourceid>opendoar____::267</oaf:collectedDatasourceid>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:hostedBy id="opendoar____::908" name="Europe PubMed Central"/>
|
||||
<oaf:collectedFrom id="opendoar____::267" name="PubMed Central"/>
|
||||
<oaf:identifier identifierType="pmc">PMC1517292</oaf:identifier>
|
||||
<oaf:identifier identifierType="pmid">18738762</oaf:identifier>
|
||||
<oaf:identifier identifierType="doi">10.3897/oneeco.2.e13718</oaf:identifier>
|
||||
</metadata>
|
||||
<about>
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2018-07-13T13:07:32.339Z">
|
||||
<baseURL>https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi</baseURL>
|
||||
<identifier>oai:pubmedcentral.nih.gov:1517292</identifier>
|
||||
<datestamp>2006-08-14</datestamp>
|
||||
<metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
|
||||
classname="sysimport:crosswalk:repository"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</about>
|
||||
</record>
|
Loading…
Reference in New Issue