orcid-no-doi #98

Closed
enrico.ottonello wants to merge 34 commits from orcid-no-doi into stable_ids
3 changed files with 28 additions and 3 deletions
Showing only changes of commit bd3b16402b - Show all commits

View File

@ -8,7 +8,6 @@ import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
@ -38,6 +37,7 @@ import eu.dnetlib.dhp.schema.orcid.AuthorSummary;
import eu.dnetlib.dhp.schema.orcid.Work; import eu.dnetlib.dhp.schema.orcid.Work;
import eu.dnetlib.dhp.schema.orcid.WorkDetail; import eu.dnetlib.dhp.schema.orcid.WorkDetail;
import eu.dnetlib.doiboost.orcid.json.JsonHelper; import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcid.util.HDFSUtil;
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf; import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher; import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
import scala.Tuple2; import scala.Tuple2;
@ -137,6 +137,8 @@ public class SparkGenEnrichedOrcidWorks {
.sparkContext() .sparkContext()
.longAccumulator("errorsNotFoundAuthors"); .longAccumulator("errorsNotFoundAuthors");
final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType"); final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType");
final LongAccumulator otherTypeFound = spark.sparkContext().longAccumulator("otherTypeFound");
final PublicationToOaf publicationToOaf = new PublicationToOaf( final PublicationToOaf publicationToOaf = new PublicationToOaf(
parsedPublications, parsedPublications,
enrichedPublications, enrichedPublications,
@ -144,7 +146,8 @@ public class SparkGenEnrichedOrcidWorks {
errorsInvalidTitle, errorsInvalidTitle,
errorsNotFoundAuthors, errorsNotFoundAuthors,
errorsInvalidType, errorsInvalidType,
dateOfCollection); otherTypeFound,
dateOfCollection);
JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
.map( .map(
e -> { e -> {
@ -173,6 +176,7 @@ public class SparkGenEnrichedOrcidWorks {
logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString()); logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString());
logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString()); logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());
logger.info("errorsInvalidType: " + errorsInvalidType.value().toString()); logger.info("errorsInvalidType: " + errorsInvalidType.value().toString());
logger.info("otherTypeFound: " + otherTypeFound.value().toString());
}); });
} }
} }

View File

@ -43,6 +43,7 @@ public class PublicationToOaf implements Serializable {
private final LongAccumulator errorsInvalidTitle; private final LongAccumulator errorsInvalidTitle;
private final LongAccumulator errorsNotFoundAuthors; private final LongAccumulator errorsNotFoundAuthors;
private final LongAccumulator errorsInvalidType; private final LongAccumulator errorsInvalidType;
private final LongAccumulator otherTypeFound;
public PublicationToOaf( public PublicationToOaf(
LongAccumulator parsedPublications, LongAccumulator parsedPublications,
@ -51,6 +52,7 @@ public class PublicationToOaf implements Serializable {
LongAccumulator errorsInvalidTitle, LongAccumulator errorsInvalidTitle,
LongAccumulator errorsNotFoundAuthors, LongAccumulator errorsNotFoundAuthors,
LongAccumulator errorsInvalidType, LongAccumulator errorsInvalidType,
LongAccumulator otherTypeFound,
String dateOfCollection) { String dateOfCollection) {
this.parsedPublications = parsedPublications; this.parsedPublications = parsedPublications;
this.enrichedPublications = enrichedPublications; this.enrichedPublications = enrichedPublications;
@ -58,6 +60,7 @@ public class PublicationToOaf implements Serializable {
this.errorsInvalidTitle = errorsInvalidTitle; this.errorsInvalidTitle = errorsInvalidTitle;
this.errorsNotFoundAuthors = errorsNotFoundAuthors; this.errorsNotFoundAuthors = errorsNotFoundAuthors;
this.errorsInvalidType = errorsInvalidType; this.errorsInvalidType = errorsInvalidType;
this.otherTypeFound = otherTypeFound;
this.dateOfCollection = dateOfCollection; this.dateOfCollection = dateOfCollection;
} }
@ -68,6 +71,8 @@ public class PublicationToOaf implements Serializable {
this.errorsInvalidTitle = null; this.errorsInvalidTitle = null;
this.errorsNotFoundAuthors = null; this.errorsNotFoundAuthors = null;
this.errorsInvalidType = null; this.errorsInvalidType = null;
this.otherTypeFound = null;
this.dateOfCollection = null;
} }
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() { private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
@ -221,6 +226,14 @@ public class PublicationToOaf implements Serializable {
final String typeValue = typologiesMapping.get(type).get("value"); final String typeValue = typologiesMapping.get(type).get("value");
cobjValue = typologiesMapping.get(type).get("cobj"); cobjValue = typologiesMapping.get(type).get("cobj");
// this dataset must contain only publications
if (cobjValue.equals("0020")) {
if (otherTypeFound != null) {
otherTypeFound.add(1);
}
return null;
}
final Instance instance = new Instance(); final Instance instance = new Instance();
// Adding hostedby // Adding hostedby

View File

@ -31,5 +31,13 @@
"annotation": {"cobj":"0018", "value": "Annotation"}, "annotation": {"cobj":"0018", "value": "Annotation"},
"physical-object": {"cobj":"0028", "value": "PhysicalObject"}, "physical-object": {"cobj":"0028", "value": "PhysicalObject"},
"preprint": {"cobj":"0016", "value": "Preprint"}, "preprint": {"cobj":"0016", "value": "Preprint"},
"software": {"cobj":"0029", "value": "Software"} "software": {"cobj":"0029", "value": "Software"},
"journal-issue": {"cobj":"0001", "value": "Article"},
"translation": {"cobj":"0038", "value": "Other literature type"},
"artistic-performance": {"cobj":"0020", "value": "Other ORP type"},
"online-resource": {"cobj":"0020", "value": "Other ORP type"},
"registered-copyright": {"cobj":"0020", "value": "Other ORP type"},
"trademark": {"cobj":"0020", "value": "Other ORP type"},
"invention": {"cobj":"0020", "value": "Other ORP type"},
"spin-off-company": {"cobj":"0020", "value": "Other ORP type"}
} }