Implemented Scholix flat mapping

This commit is contained in:
Sandro La Bruzzo 2023-04-12 11:52:41 +02:00
parent ef82b8362d
commit ab1842e5dc
8 changed files with 412 additions and 168 deletions

View File

@ -0,0 +1,158 @@
package eu.dnetlib.dhp.sx.graph.scholix;
import java.util.List;
/**
 * Flat, bean-style view of a Scholix link: every property is either a String or a
 * List of Strings and is exposed through a plain getter/setter pair.
 *
 * The class itself performs no validation, copying or de-duplication; it only stores
 * what it is given. Instances are populated by ScholixUtils.flattenizeScholix, and the
 * per-field notes below describe the values that method writes.
 */
public class ScholixFlat {
// Identifier of the Scholix link.
private String identifier;
// Name of the relationship between source and target.
private String relationType;
// Dnet identifier of the source resource.
private String sourceId;
// Object type / sub type of the source resource.
private String sourceType;
private String sourceSubType;
// Identifier values and identifier schemas of the source.
// NOTE: flattenizeScholix de-duplicates these two lists independently, so positions
// in sourcePid and sourcePidType are not guaranteed to correspond to each other.
private List<String> sourcePid;
private List<String> sourcePidType;
// Publisher names of the source resource.
private List<String> sourcePublisher;
// Dnet identifier of the target resource.
private String targetId;
// Object type / sub type of the target resource.
private String targetType;
private String targetSubType;
// Identifier values and identifier schemas of the target (same caveat as for the source lists).
private List<String> targetPid;
private List<String> targetPidType;
// Publisher names of the target resource.
private List<String> targetPublisher;
// Names of the link providers.
private List<String> linkProviders;
// Publication date, kept as the String found on the Scholix record.
private String publicationDate;
// JSON string handed to flattenizeScholix alongside the record, stored verbatim.
private String blob;
// Plain accessors: each getter returns the stored reference and each setter stores the
// given reference as-is (no defensive copy, no null handling).
public String getIdentifier() {
return identifier;
}
public void setIdentifier(String identifier) {
this.identifier = identifier;
}
public String getRelationType() {
return relationType;
}
public void setRelationType(String relationType) {
this.relationType = relationType;
}
public String getSourceId() {
return sourceId;
}
public void setSourceId(String sourceId) {
this.sourceId = sourceId;
}
public String getSourceType() {
return sourceType;
}
public void setSourceType(String sourceType) {
this.sourceType = sourceType;
}
public String getSourceSubType() {
return sourceSubType;
}
public void setSourceSubType(String sourceSubType) {
this.sourceSubType = sourceSubType;
}
public List<String> getSourcePid() {
return sourcePid;
}
public void setSourcePid(List<String> sourcePid) {
this.sourcePid = sourcePid;
}
public List<String> getSourcePidType() {
return sourcePidType;
}
public void setSourcePidType(List<String> sourcePidType) {
this.sourcePidType = sourcePidType;
}
public List<String> getSourcePublisher() {
return sourcePublisher;
}
public void setSourcePublisher(List<String> sourcePublisher) {
this.sourcePublisher = sourcePublisher;
}
public String getTargetId() {
return targetId;
}
public void setTargetId(String targetId) {
this.targetId = targetId;
}
public String getTargetType() {
return targetType;
}
public void setTargetType(String targetType) {
this.targetType = targetType;
}
public String getTargetSubType() {
return targetSubType;
}
public void setTargetSubType(String targetSubType) {
this.targetSubType = targetSubType;
}
public List<String> getTargetPid() {
return targetPid;
}
public void setTargetPid(List<String> targetPid) {
this.targetPid = targetPid;
}
public List<String> getTargetPidType() {
return targetPidType;
}
public void setTargetPidType(List<String> targetPidType) {
this.targetPidType = targetPidType;
}
public List<String> getTargetPublisher() {
return targetPublisher;
}
public void setTargetPublisher(List<String> targetPublisher) {
this.targetPublisher = targetPublisher;
}
public List<String> getLinkProviders() {
return linkProviders;
}
public void setLinkProviders(List<String> linkProviders) {
this.linkProviders = linkProviders;
}
public String getPublicationDate() {
return publicationDate;
}
public void setPublicationDate(String publicationDate) {
this.publicationDate = publicationDate;
}
public String getBlob() {
return blob;
}
public void setBlob(String blob) {
this.blob = blob;
}
}

View File

@ -1,14 +1,15 @@
package eu.dnetlib.dhp.sx.graph.scholix package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty} import eu.dnetlib.dhp.schema.oaf.{Dataset, OtherResearchProduct, Publication, Relation, Result, Software, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.scholix._ import eu.dnetlib.dhp.schema.sx.scholix._
import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology} import eu.dnetlib.dhp.schema.sx.summary.{AuthorPid, CollectedFromType, SchemeValue, ScholixSummary, Typology}
import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders} import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s import org.json4s
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.io.Source import scala.io.Source
@ -59,6 +60,36 @@ object ScholixUtils extends Serializable {
} }
} }
/** Builds the flat representation of a Scholix link.
  *
  * Identifier values, identifier schemas, publisher names and link-provider names are
  * copied as de-duplicated lists. Source and target are handled symmetrically: in
  * particular the source publishers are now de-duplicated exactly like the target
  * publishers (previously only the target side was).
  *
  * @param input the Scholix record to flatten; its relationship must not be null
  * @param json  string stored verbatim in the `blob` property of the result
  * @return the populated [[ScholixFlat]], or `null` when the source or the target is
  *         missing or has a null identifier list
  */
def flattenizeScholix(input: Scholix, json: String): ScholixFlat = {
  val flat: ScholixFlat = new ScholixFlat
  flat.setIdentifier(input.getIdentifier)
  flat.setRelationType(input.getRelationship.getName)

  val source = input.getSource
  val target = input.getTarget
  val hasBothEnds =
    source != null && source.getIdentifier != null &&
      target != null && target.getIdentifier != null

  // Callers rely on a null result to detect links that cannot be flattened.
  if (!hasBothEnds) null
  else {
    val sourceIdentifiers = source.getIdentifier.asScala
    flat.setSourceId(source.getDnetIdentifier)
    flat.setSourcePid(sourceIdentifiers.map(p => p.getIdentifier).distinct.toList.asJava)
    flat.setSourcePidType(sourceIdentifiers.map(p => p.getSchema).distinct.toList.asJava)
    flat.setSourceType(source.getObjectType)
    flat.setSourceSubType(source.getObjectSubType)
    if (source.getPublisher != null)
      flat.setSourcePublisher(source.getPublisher.asScala.map(p => p.getName).distinct.toList.asJava)

    val targetIdentifiers = target.getIdentifier.asScala
    flat.setTargetId(target.getDnetIdentifier)
    flat.setTargetPid(targetIdentifiers.map(p => p.getIdentifier).distinct.toList.asJava)
    flat.setTargetPidType(targetIdentifiers.map(p => p.getSchema).distinct.toList.asJava)
    flat.setTargetType(target.getObjectType)
    flat.setTargetSubType(target.getObjectSubType)
    if (target.getPublisher != null)
      flat.setTargetPublisher(target.getPublisher.asScala.map(p => p.getName).distinct.toList.asJava)

    flat.setPublicationDate(input.getPublicationDate)
    if (input.getLinkprovider != null)
      flat.setLinkProviders(input.getLinkprovider.asScala.map(l => l.getName).distinct.toList.asJava)
    flat.setBlob(json)
    flat
  }
}
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = { def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName) new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
@ -232,7 +263,14 @@ object ScholixUtils extends Serializable {
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) { if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
val l: List[ScholixEntityId] = val l: List[ScholixEntityId] =
summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList summaryObject.getAuthor.asScala.map(a => {
if (a.getORCID != null)
new ScholixEntityId(
a.getFullname,
List(new ScholixIdentifier(a.getORCID, "ORCID", s"https://orcid.org/${a.getORCID}")).asJava
)
else new ScholixEntityId(a.getFullname, null)
}).toList
if (l.nonEmpty) if (l.nonEmpty)
r.setCreator(l.asJava) r.setCreator(l.asJava)
} }
@ -377,10 +415,13 @@ object ScholixUtils extends Serializable {
if (persistentIdentifiers.isEmpty) if (persistentIdentifiers.isEmpty)
return null return null
s.setLocalIdentifier(persistentIdentifiers.asJava) s.setLocalIdentifier(persistentIdentifiers.asJava)
if (r.isInstanceOf[Publication]) r match {
s.setTypology(Typology.publication) case _: Publication => s.setTypology(Typology.publication)
else case _: Dataset => s.setTypology(Typology.dataset)
s.setTypology(Typology.dataset) case _: Software => s.setTypology(Typology.software)
case _: OtherResearchProduct => s.setTypology(Typology.otherresearchproduct)
case _ =>
}
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
@ -393,7 +434,20 @@ object ScholixUtils extends Serializable {
} }
if (r.getAuthor != null && !r.getAuthor.isEmpty) { if (r.getAuthor != null && !r.getAuthor.isEmpty) {
val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname).toList val authors: List[AuthorPid] = r.getAuthor.asScala
.map(a => {
var ORCID: String = null
if (a.getPid != null) {
val result = a.getPid.asScala.find(p =>
p.getQualifier != null && p.getQualifier.getClassid != null && p.getQualifier.getClassid.toLowerCase
.contains("orcid")
)
if (result.isDefined)
ORCID = result.get.getValue
}
new AuthorPid(a.getFullname, ORCID)
})
.toList
if (authors.nonEmpty) if (authors.nonEmpty)
s.setAuthor(authors.asJava) s.setAuthor(authors.asJava)
} }

View File

@ -0,0 +1,42 @@
package eu.dnetlib.dhp.sx.graph.scholix;
import static org.junit.jupiter.api.Assertions.*;
import java.io.*;
import java.util.zip.GZIPInputStream;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.sx.scholix.Scholix;
public class ScholixFlatTest {

    /**
     * Reads the gzipped, line-delimited Scholix records shipped as a test resource,
     * flattens each one and checks that the scalar fields are carried over unchanged.
     *
     * The flattened records are also serialized to a temporary file, so that the JSON
     * serialization of ScholixFlat is exercised without depending on a path that only
     * exists on one developer's machine.
     */
    @Test
    public void flattenScholixTest() throws IOException {
        final ObjectMapper mapper = new ObjectMapper();

        // Temporary output instead of a hard-coded absolute path; removed when the JVM exits.
        final File output = File.createTempFile("scholix_flat_records", ".json");
        output.deleteOnExit();

        // try-with-resources closes every stream even when an assertion fails mid-loop.
        try (InputStream resource = getClass().getResourceAsStream("scholix_records.gz")) {
            assertNotNull(resource, "test resource scholix_records.gz not found on the classpath");

            try (BufferedReader buffered = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(resource), "UTF-8"));
                Writer writer = new FileWriter(output)) {

                String line;
                while ((line = buffered.readLine()) != null) {
                    final Scholix s = mapper.readValue(line, Scholix.class);
                    // Check the parsed record before it is handed to the mapping.
                    assertNotNull(s);

                    final ScholixFlat flat = ScholixUtils.flattenizeScholix(s, line);
                    assertNotNull(flat);

                    assertEquals(s.getIdentifier(), flat.getIdentifier());
                    assertEquals(s.getRelationship().getName(), flat.getRelationType());
                    assertEquals(s.getSource().getObjectType(), flat.getSourceType());
                    assertEquals(s.getSource().getObjectSubType(), flat.getSourceSubType());

                    writer.write(mapper.writeValueAsString(flat));
                    writer.write("\n");
                }
            }
        }
    }
}

View File

@ -1,194 +1,190 @@
package eu.dnetlib.dhp.sx.provision; package eu.dnetlib.dhp.sx.provision;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.sx.scholix.Scholix;
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.sx.scholix.Scholix;
import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource;
public class ScholixFlat { public class ScholixFlat {
private static ObjectMapper MAPPER = new ObjectMapper(); private static ObjectMapper MAPPER = new ObjectMapper();
private List<String> linkProvider= new ArrayList<>(); private List<String> linkProvider = new ArrayList<>();
private String publicationDate; private String publicationDate;
private List<String> sourceLinkPublisher = new ArrayList<>(); private List<String> sourceLinkPublisher = new ArrayList<>();
private List<String> targetLinkPublisher = new ArrayList<>(); private List<String> targetLinkPublisher = new ArrayList<>();
private String sourceDnetIdentifier ; private String sourceDnetIdentifier;
private String targetDnetIdentifier ; private String targetDnetIdentifier;
private List<String> sourcePids = new ArrayList<>(); private List<String> sourcePids = new ArrayList<>();
private List<String> sourcePidTypes = new ArrayList<>(); private List<String> sourcePidTypes = new ArrayList<>();
private List<String> targetPids = new ArrayList<>(); private List<String> targetPids = new ArrayList<>();
private List<String> targetPidTypes = new ArrayList<>(); private List<String> targetPidTypes = new ArrayList<>();
private String json; private String json;
public void addLinkProvider(final String providerName) {
addStringToList(providerName, this.linkProvider);
}
public void addSourceLinkPublisher(final String linkPublisher) {
addStringToList(linkPublisher, sourceLinkPublisher);
public void addLinkProvider(final String providerName) { }
addStringToList(providerName, this.linkProvider);
}
public void addSourceLinkPublisher(final String linkPublisher) { public void addTargetLinkPublisher(final String linkPublisher) {
addStringToList(linkPublisher, sourceLinkPublisher); addStringToList(linkPublisher, targetLinkPublisher);
} }
public void addTargetLinkPublisher(final String linkPublisher) {
addStringToList(linkPublisher, targetLinkPublisher);
} public void addSourcePid(final String pid) {
addStringToList(pid, sourcePids);
}
public void addSourcePid(final String pid) { public void addSourcePidType(final String pidType) {
addStringToList(pid, sourcePids); addStringToList(pidType, sourcePidTypes);
} }
public void addSourcePidType(final String pidType) { public void addTargetPidType(final String pidType) {
addStringToList(pidType, sourcePidTypes); addStringToList(pidType, targetPidTypes);
} }
public void addTargetPidType(final String pidType) { public void addTargetPid(final String pid) {
addStringToList(pidType, targetPidTypes); addStringToList(pid, targetPids);
} }
public void addStringToList(final String s, final List<String> l) {
if (l != null && !l.contains(s))
l.add(s);
}
public void addTargetPid(final String pid) { public String getSourceDnetIdentifier() {
addStringToList(pid, targetPids); return sourceDnetIdentifier;
} }
public void addStringToList(final String s, final List<String>l ) { public void setSourceDnetIdentifier(String sourceDnetIdentifier) {
if (l!= null && !l.contains(s)) this.sourceDnetIdentifier = sourceDnetIdentifier;
l.add(s); }
}
public String getSourceDnetIdentifier() { public String getTargetDnetIdentifier() {
return sourceDnetIdentifier; return targetDnetIdentifier;
} }
public void setSourceDnetIdentifier(String sourceDnetIdentifier) { public void setTargetDnetIdentifier(String targetDnetIdentifier) {
this.sourceDnetIdentifier = sourceDnetIdentifier; this.targetDnetIdentifier = targetDnetIdentifier;
} }
public String getTargetDnetIdentifier() { public List<String> getSourcePids() {
return targetDnetIdentifier; return sourcePids;
} }
public void setTargetDnetIdentifier(String targetDnetIdentifier) { public void setSourcePids(List<String> sourcePids) {
this.targetDnetIdentifier = targetDnetIdentifier; this.sourcePids = sourcePids;
} }
public List<String> getSourcePids() { public List<String> getSourcePidTypes() {
return sourcePids; return sourcePidTypes;
} }
public void setSourcePids(List<String> sourcePids) { public void setSourcePidTypes(List<String> sourcePidTypes) {
this.sourcePids = sourcePids; this.sourcePidTypes = sourcePidTypes;
} }
public List<String> getSourcePidTypes() { public List<String> getTargetPids() {
return sourcePidTypes; return targetPids;
} }
public void setSourcePidTypes(List<String> sourcePidTypes) { public void setTargetPids(List<String> targetPids) {
this.sourcePidTypes = sourcePidTypes; this.targetPids = targetPids;
} }
public List<String> getTargetPids() { public List<String> getTargetPidTypes() {
return targetPids; return targetPidTypes;
} }
public void setTargetPids(List<String> targetPids) { public void setTargetPidTypes(List<String> targetPidTypes) {
this.targetPids = targetPids; this.targetPidTypes = targetPidTypes;
} }
public List<String> getTargetPidTypes() { public List<String> getSourceLinkPublisher() {
return targetPidTypes; return sourceLinkPublisher;
} }
public void setTargetPidTypes(List<String> targetPidTypes) { public void setSourceLinkPublisher(List<String> sourceLinkPublisher) {
this.targetPidTypes = targetPidTypes; this.sourceLinkPublisher = sourceLinkPublisher;
} }
public List<String> getSourceLinkPublisher() { public List<String> getTargetLinkPublisher() {
return sourceLinkPublisher; return targetLinkPublisher;
} }
public void setSourceLinkPublisher(List<String> sourceLinkPublisher) { public void setTargetLinkPublisher(List<String> targetLinkPublisher) {
this.sourceLinkPublisher = sourceLinkPublisher; this.targetLinkPublisher = targetLinkPublisher;
} }
public List<String> getTargetLinkPublisher() { public List<String> getLinkProvider() {
return targetLinkPublisher; return linkProvider;
} }
public void setTargetLinkPublisher(List<String> targetLinkPublisher) { public void setLinkProvider(List<String> linkProvider) {
this.targetLinkPublisher = targetLinkPublisher; this.linkProvider = linkProvider;
} }
public String getPublicationDate() {
return publicationDate;
}
public List<String> getLinkProvider() { public void setPublicationDate(String publicationDate) {
return linkProvider; this.publicationDate = publicationDate;
} }
public void setLinkProvider(List<String> linkProvider) { public String getJson() {
this.linkProvider = linkProvider; return json;
} }
public String getPublicationDate() { public void setJson(String json) {
return publicationDate; this.json = json;
} }
public void setPublicationDate(String publicationDate) { public static ScholixFlat fromScholix(final Scholix scholix) throws JsonProcessingException {
this.publicationDate = publicationDate; if (scholix == null || scholix.getSource() == null || scholix.getTarget() == null)
} return null;
final ScholixFlat flat = new ScholixFlat();
if (scholix.getLinkprovider() != null)
scholix.getLinkprovider().forEach(l -> flat.addLinkProvider(l.getName()));
public String getJson() { flat.setPublicationDate(scholix.getPublicationDate());
return json;
}
public void setJson(String json) { final ScholixResource source = scholix.getSource();
this.json = json; flat.setSourceDnetIdentifier(source.getDnetIdentifier());
} if (source.getIdentifier() != null) {
source.getIdentifier().forEach(i -> {
flat.addSourcePid(i.getIdentifier());
flat.addSourcePidType(i.getSchema());
});
}
if (source.getPublisher() != null) {
source.getPublisher().forEach(p -> flat.addSourceLinkPublisher(p.getName()));
}
public static ScholixFlat fromScholix(final Scholix scholix) throws JsonProcessingException { final ScholixResource target = scholix.getSource();
if (scholix== null || scholix.getSource()==null || scholix.getTarget()== null) flat.setTargetDnetIdentifier(target.getDnetIdentifier());
return null; if (target.getIdentifier() != null) {
final ScholixFlat flat = new ScholixFlat(); target.getIdentifier().forEach(i -> {
if (scholix.getLinkprovider()!= null) flat.addTargetPid(i.getIdentifier());
scholix.getLinkprovider().forEach(l ->flat.addLinkProvider(l.getName())); flat.addTargetPidType(i.getSchema());
});
flat.setPublicationDate(scholix.getPublicationDate()); }
if (target.getPublisher() != null) {
final ScholixResource source = scholix.getSource(); target.getPublisher().forEach(p -> flat.addTargetLinkPublisher(p.getName()));
flat.setSourceDnetIdentifier(source.getDnetIdentifier()); }
if (source.getIdentifier()!= null) { flat.setJson(MAPPER.writeValueAsString(scholix));
source.getIdentifier().forEach(i -> { return flat;
flat.addSourcePid(i.getIdentifier()); }
flat.addSourcePidType(i.getSchema());
});
}
if (source.getPublisher()!= null) {
source.getPublisher().forEach(p -> flat.addSourceLinkPublisher(p.getName()));
}
final ScholixResource target = scholix.getSource();
flat.setTargetDnetIdentifier(target.getDnetIdentifier());
if (target.getIdentifier()!= null) {
target.getIdentifier().forEach(i -> {
flat.addTargetPid(i.getIdentifier());
flat.addTargetPidType(i.getSchema());
});
}
if (target.getPublisher()!= null) {
target.getPublisher().forEach(p -> flat.addTargetLinkPublisher(p.getName()));
}
flat.setJson(MAPPER.writeValueAsString(scholix));
return flat;
}
} }

View File

@ -56,7 +56,7 @@ public class XmlRecordFactoryTest {
assertNotNull(doc); assertNotNull(doc);
//System.out.println(doc.asXML()); // System.out.println(doc.asXML());
assertEquals("0000-0001-9613-6638", doc.valueOf("//creator[@rank = '1']/@orcid")); assertEquals("0000-0001-9613-6638", doc.valueOf("//creator[@rank = '1']/@orcid"));
assertEquals("0000-0001-9613-6639", doc.valueOf("//creator[@rank = '1']/@orcid_pending")); assertEquals("0000-0001-9613-6639", doc.valueOf("//creator[@rank = '1']/@orcid_pending"));

View File

@ -6,19 +6,17 @@ import org.junit.Before
import org.junit.jupiter.api.{Test} import org.junit.jupiter.api.{Test}
class ScholixFlatTest{ class ScholixFlatTest {
var spark:SparkSession = null
var spark: SparkSession = null
def initSpark(): Unit = { def initSpark(): Unit = {
if (spark!= null) if (spark != null)
return return
println("SONO QUI") println("SONO QUI")
val conf = new SparkConf val conf = new SparkConf
conf.setAppName(getClass.getSimpleName ) conf.setAppName(getClass.getSimpleName)
conf.setMaster("local[*]") conf.setMaster("local[*]")
conf.set("spark.driver.host", "localhost") conf.set("spark.driver.host", "localhost")
conf.set("hive.metastore.local", "true") conf.set("hive.metastore.local", "true")
@ -27,26 +25,22 @@ class ScholixFlatTest{
spark = SparkSession spark = SparkSession
.builder() .builder()
.appName(getClass.getSimpleName) .appName(getClass.getSimpleName)
.config(conf) .config(conf)
.getOrCreate() .getOrCreate()
} }
def after(): Unit = { def after(): Unit = {
spark.stop() spark.stop()
} }
@Test @Test
def testScholixConversion (): Unit = { def testScholixConversion(): Unit = {
initSpark() initSpark()
val p = getClass.getResource("/eu/dnetlib/dhp/sx/provision/scholix_dump.zip").getPath val p = getClass.getResource("/eu/dnetlib/dhp/sx/provision/scholix_dump.zip").getPath
val t = spark.read.text(p).count val t = spark.read.text(p).count
println(s"total =$t") println(s"total =$t")
} }
} }

View File

@ -807,7 +807,7 @@
<mockito-core.version>3.3.3</mockito-core.version> <mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version> <mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version> <vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[3.16.0]</dhp-schemas.version> <dhp-schemas.version>[3.16.1-SNAPSHOT]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version> <dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version> <dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version> <dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>