[Annotation] extension of bulk tagging to accommodate graph annotation

This commit is contained in:
Miriam Baglioni 2024-08-05 17:08:52 +02:00
parent 740cfa77fb
commit b5130583e5
29 changed files with 340 additions and 181 deletions

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map; import java.util.Map;

View File

@ -0,0 +1,2 @@
sdgPath=/tmp/sdg_20240627_oaid_csv
outputPath=/tmp/miriam/sdgnodoi

View File

@ -0,0 +1,100 @@
{"affiliationId":{"schema":"RINGGOLD","value":"8244"},"departmentName":"Biology","endDate":"2019-05-19","orcid":"0000-0001-6291-9619","roleTitle":"Undergraduate Research Assistant","startDate":"2017-01-26"}
{"affiliationId":{"schema":"GRID","value":"grid.445941.9"},"departmentName":"Department architectural-building constructions","endDate":"","orcid":"0000-0001-6291-9619","roleTitle":"Assistant professor","startDate":"2014-09"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/02jx3x895"},"departmentName":"Learning and Leadership","endDate":"2012-09-01","orcid":"0000-0002-3210-3034","roleTitle":"Leturer"}
{"affiliationId":{"schema":"RINGGOLD","value":"445071"},"departmentName":"Fisheries and Aquaculture","endDate":"2006-01-012012-08-23","orcid":"0000-0002-9030-7609","roleTitle":"Technical Officer"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/05a28rw58"},"departmentName":"Institute of Environmental Engineering","endDate":"2023-11-012024-01-31","orcid":"0000-0002-9030-7609","roleTitle":"Visiting Researcher"}
{"affiliationId":{"schema":"","value":""},"departmentName":"Faculty of Engineering and Informatics","endDate":"2021-12-20","orcid":"0000-0003-0305-8980","roleTitle":"Lecturer"}
{"affiliationId":{"schema":"RINGGOLD","value":"26066"},"departmentName":"Obstetrics and Gynaecology","orcid":"0000-0003-0305-8980","roleTitle":"MD, PhD"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/00ey9xa07"},"departmentName":"Harbin Sport University","endDate":"2024-06-01","orcid":"0000-0003-0305-8980","roleTitle":"Student"}
{"affiliationId":{"schema":"","value":""},"departmentName":"KIPP DC Schools","endDate":"2016-01-012017-01-01","orcid":"0009-0004-7554-419X","roleTitle":"Middle School Science Teacher"}
{"affiliationId":{"schema":"RINGGOLD","value":"19374"},"departmentName":"Music Therapy","endDate":"","orcid":"0000-0002-5115-9762","roleTitle":"Dementia Program Director","startDate":"2017-10-01"}
{"affiliationId":{"schema":"RINGGOLD","value":"9144"},"departmentName":"Interdisciplinary Center for Scientific Computing - IWR","endDate":"2009-01-152012-10-31","orcid":"0000-0002-2004-4153","roleTitle":"PhD student"}
{"departmentName":"2nd Air Supply Maintenance Center Command","endDate":"2012-08-30","orcid":"0000-0002-4389-9744","roleTitle":"Production Planning Group Supervisor","startDate":"2008-09-01"}
{"affiliationId":{"schema":"","value":""},"departmentName":"MITAKY High-Tech Co., Ltd.","endDate":"2020-07-01","orcid":"0000-0001-7628-743X","roleTitle":"President & CEO"}
{"departmentName":"Ancash","endDate":"","orcid":"0000-0002-3861-2833","startDate":""}
{"departmentName":"CALDAS","endDate":"2010-06-17","orcid":"0000-0003-1077-4053","roleTitle":"COORDINADOR ESTUDIO DE RADIO Y TV","startDate":"1999-03-17"}
{"departmentName":"Institute of Sociogenesis and Social Dynamics","endDate":"","orcid":"0000-0001-6881-7760","startDate":""}
{"affiliationId":{"schema":"RINGGOLD","value":"27004"},"departmentName":"Biology","endDate":"1999-09-02","orcid":"0000-0002-8553-169X","roleTitle":"MCf Microbiology "}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/041nas322"},"departmentName":"Bonn Center for Dependency and Slavery Studies","endDate":"2019-01-012023-01-01","orcid":"0009-0002-9250-948X","roleTitle":"Predoctoral Research Associate / wiss. Mitarbeiter"}
{"departmentName":"Процессы и аппараты химических и пищевых производств","endDate":"","orcid":"0000-0002-1805-5670","roleTitle":"старший преподаватель","startDate":"2015-09-03"}
{"affiliationId":{"schema":"RINGGOLD","value":"95414"},"departmentName":"Department of Science, Technology and International relations","endDate":"2021-03-09","orcid":"0000-0002-7616-2482","roleTitle":"Researcher"}
{"affiliationId":{"schema":"RINGGOLD","value":"28730"},"departmentName":"Forensic Medicine & Toxicology","endDate":"2019-08-26","orcid":"0000-0001-7369-1744","roleTitle":"Senior Resident","startDate":"2016-08-27"}
{"departmentName":"Department of Functional & Comparative Genomics","endDate":"2015-10","orcid":"0000-0001-8059-8919","roleTitle":"Postdoctoral Research Associate, Fluorescence Chemical Sensors","startDate":"2014-08"}
{"affiliationId":{"schema":"RINGGOLD","value":"16771"},"departmentName":"Catedra UNESCO de Gestion y Politica Universitaria","endDate":"2016","orcid":"0000-0001-9437-6700","roleTitle":"Investigador","startDate":"2001"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/05v3pg621"},"departmentName":"Department of Multimedia Animation and Application","endDate":"","orcid":"0000-0003-2513-7065","roleTitle":"Professor","startDate":"2012-02-01"}
{"affiliationId":{"schema":"RINGGOLD","value":"246714"},"departmentName":"Delta Dental of Wisconsin","endDate":"","orcid":"0000-0003-2675-3206","roleTitle":"VP & Science Officer","startDate":"2006"}
{"affiliationId":{"schema":"RINGGOLD","value":"146895"},"departmentName":"Facultad de Ciencias Económicas, administrativas y Contables ","endDate":"2015-02-01","orcid":"0000-0001-9384-6395","roleTitle":"Coordinadora de Investigaciones-Facultad CEAC"}
{"affiliationId":{"schema":"RINGGOLD","value":"8367"},"departmentName":"Microbiology, Immunology and Tropical Medicine","endDate":"2019-10-01","orcid":"0000-0002-2349-263X","roleTitle":"Assistant Professor"}
{"affiliationId":{"schema":"RINGGOLD","value":"129705"},"departmentName":"Institut de Recherche en Informatique Fondamentale","orcid":"0000-0003-0287-6252","roleTitle":""}
{"affiliationId":{"schema":"RINGGOLD","value":"119726"},"departmentName":"Computer Engineering/MIS","endDate":"2018-09-15","orcid":"0000-0003-0014-5106","roleTitle":"Assoc.Prof.Dr."}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/00wbwde85"},"departmentName":"school","endDate":"2021-03-01","orcid":"0009-0007-7585-0594","roleTitle":"mahasiswa"}
{"affiliationId":{"schema":"RINGGOLD","value":"9171"},"departmentName":"Computer Science Dept.","endDate":"","orcid":"0000-0002-3202-2904","startDate":"2002-04-01"}
{"affiliationId":{"schema":"GRID","value":"grid.474837.b"},"departmentName":"Gastroenterology","endDate":"","orcid":"0000-0003-0705-5760","startDate":""}
{"affiliationId":{"schema":"","value":""},"departmentName":"School of Physical and Occupational Therapy","endDate":"2018-03-012021-12-31","orcid":"0000-0002-8406-5228","roleTitle":"Postdoctoral Fellow"}
{"affiliationId":{"schema":"","value":""},"departmentName":"Xinqiao Hospital","orcid":"0000-0001-6200-2309","roleTitle":""}
{"affiliationId":{"schema":"","value":""},"departmentName":"DIPARTIMENTO DI PATOLOGIA CHIRURGICA, MEDICA, MOLECOLARE E DELL'AREA CRITICA","endDate":"2018-10-012021-10-01","orcid":"0000-0002-5588-2608","roleTitle":"Ricercatori a tempo determinato"}
{"affiliationId":{"schema":"","value":""},"departmentName":"ECE","endDate":"2021-01-012022-02-01","orcid":"0000-0002-8729-0287","roleTitle":"Visiting Scholar"}
{"departmentName":"Mantenimiento","endDate":"1998-01-01","orcid":"0000-0002-8663-2716","roleTitle":"Project coordinator","startDate":"1994-01-01"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/012wtwr40"},"departmentName":"Centro Universitário Newton Paiva","endDate":"2012-07","orcid":"0000-0002-1725-1805","roleTitle":"Professor Assistente","startDate":"2005-09"}
{"departmentName":"DFC","endDate":"","orcid":"0000-0003-3764-9500","roleTitle":"Professora Substituta","startDate":"2018-02-01"}
{"affiliationId":{"schema":"RINGGOLD","value":"10107"},"departmentName":"Automated Lab - Women's & Children's Hospital site","endDate":"1995-09-052018-09-09","orcid":"0000-0002-5594-9737","roleTitle":"Medical Scientist"}
{"affiliationId":{"schema":"RINGGOLD","value":"33784"},"departmentName":"Computer Science","endDate":"2023-06-04","orcid":"0009-0000-6585-6246","roleTitle":"Visiting Assistant Professor","startDate":"2023-01-09"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/0280a3n32"},"departmentName":"Research","endDate":"2022-06","orcid":"0000-0002-0846-9503","roleTitle":"Research Assistant","startDate":"2019-06"}
{"affiliationId":{"schema":"GRID","value":"grid.8657.c"},"departmentName":"Finnish Meteorological Institute","endDate":"2019-06-01","orcid":"0000-0002-4826-2929","roleTitle":""}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/05290cv24"},"departmentName":"Dipartimento di Informatica e Tecnologie dell'Informazione","endDate":"2022-11-012025-11-01","orcid":"0009-0000-6476-8092","roleTitle":"PhD Student"}
{"affiliationId":{"schema":"GRID","value":"grid.495082.2"},"departmentName":"Laboratory of water bodies sanitaric microbiology and human microbial ecology","endDate":"2017-01-01","orcid":"0000-0003-1194-7251","roleTitle":"Senior reseacher "}
{"affiliationId":{"schema":"RINGGOLD","value":"150713"},"departmentName":"Education","endDate":"2013-09-01","orcid":"0000-0002-2489-1202","roleTitle":"Doctor of Education"}
{"departmentName":"Office of Risk Management","endDate":"","orcid":"0000-0003-2772-313X","roleTitle":"Senior Policy Advisor","startDate":"2014-09-04"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/04pe1sa24"},"departmentName":"Facultad de Estudios Globales y Hospitalidad","endDate":"","orcid":"0009-0003-4270-4196","roleTitle":"Docente en las Carreras de Licenciatura en Turismo y Relaciones Internacionales","startDate":"2023-06-12"}
{"affiliationId":{"schema":"","value":""},"departmentName":"Civil & Mechanical Engineering","endDate":"2014-06-012020-06-30","orcid":"0000-0001-6598-2525","roleTitle":"Assistant Professor"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/02ytfzr55"},"departmentName":"Department of Civil Engineering ","endDate":"2022-03-212024-02-27","orcid":"0000-0002-9572-1358","roleTitle":"Temporary Faculty"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/04wdt0z89"},"departmentName":"library","endDate":"2024-03-01","orcid":"0009-0002-8124-1772","roleTitle":"library it"}
{"departmentName":"Kalil e Pires Advogados","endDate":"","orcid":"0009-0001-3403-0297","roleTitle":"Estagiário","startDate":"2023-03-13"}
{"affiliationId":{"schema":"","value":""},"departmentName":"Coronel Institute of Occupational Health","endDate":"2019-01-01","orcid":"0000-0002-0461-4013","roleTitle":"Principal Investigator"}
{"affiliationId":{"schema":"GRID","value":"grid.5801.c"},"departmentName":"Health Sciences and Technology","orcid":"0000-0002-1651-0457","roleTitle":"Doctoral student"}
{"departmentName":"Prefeitura de Teresina","endDate":"","orcid":"0000-0002-8148-4179","startDate":""}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/00k8rrx20"},"departmentName":"Prosseguir","endDate":"2023-05-012024-05-01","orcid":"0000-0001-5147-3455","roleTitle":"Coordenadora Pedagógica Regional do Prosseguir em Manaus"}
{"affiliationId":{"schema":"RINGGOLD","value":"381864"},"departmentName":"Pharmaron Beijing Co Ltd","endDate":"","orcid":"0000-0003-2165-740X","startDate":"2015-10-27"}
{"affiliationId":{"schema":"RINGGOLD","value":"183390"},"departmentName":"Instituto Tecnológico Superior de Irapuato","endDate":"","orcid":"0000-0003-2101-5917","startDate":"2018-11-01"}
{"affiliationId":{"schema":"RINGGOLD","value":"384754"},"departmentName":"SynCat@Beijing","orcid":"0000-0002-1050-2165","roleTitle":"Vice Director"}
{"affiliationId":{"schema":"RINGGOLD","value":"282795"},"departmentName":"Setor de Coleções Científicas","endDate":"","orcid":"0000-0003-3755-0025","roleTitle":"Estagiária","startDate":"2019-11"}
{"affiliationId":{"schema":"RINGGOLD","value":"434589"},"departmentName":"Chemistry","endDate":"2015-05-25","orcid":"0000-0001-5861-4425","roleTitle":"Lecturer"}
{"affiliationId":{"schema":"","value":""},"departmentName":"Cajamarca","endDate":"2023-11-15","orcid":"0000-0003-1524-3315","roleTitle":"Asistente Administrativo Provincial"}
{"affiliationId":{"schema":"","value":""},"departmentName":"Seduc ma","endDate":"2012-01-24","orcid":"0000-0003-3142-356X","roleTitle":""}
{"departmentName":"Rede Particular de Ensino","endDate":"2021","orcid":"0000-0002-4771-2131","roleTitle":"Professora de Artes Cênicas","startDate":"2018"}
{"affiliationId":{"schema":"FUNDREF","value":"http://dx.doi.org/10.13039/100009042"},"departmentName":"Derecho del Trabajo y de la Seguridad Social","endDate":"2002-12-01","orcid":"0000-0002-1275-5289","roleTitle":"Catedrático de Derecho del Trabajo y de la Seguridad Social"}
{"departmentName":"Department of Ethology","endDate":"2011-01-02","orcid":"0000-0003-1436-7324","roleTitle":"Research Assistant","startDate":"2007-09-01"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/04ka8rx28"},"departmentName":"Mechanical Engineering","orcid":"0009-0006-6397-2183","roleTitle":""}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/002qhr126"},"departmentName":"Theatre","orcid":"0009-0001-6531-9624","roleTitle":"theacher"}
{"affiliationId":{"schema":"","value":""},"departmentName":"Institute of Molecular Medicine, Renji Hospital, School of Medicine.","endDate":"2021-01-27","orcid":"0000-0003-0399-1201","roleTitle":"Associate Professor"}
{"affiliationId":{"schema":"","value":""},"departmentName":"Cell Biology","endDate":"2013-11-012014-07-01","orcid":"0000-0003-1489-4757","roleTitle":"Research Scolarship for Undergraduate Students"}
{"affiliationId":{"schema":"RINGGOLD","value":"346985"},"departmentName":"Maternidade","endDate":"","orcid":"0000-0002-6985-9679","roleTitle":"Enfermeira/UTI neonatal","startDate":"2019-02-26"}
{"affiliationId":{"schema":"RINGGOLD","value":"41726"},"departmentName":"Area Team - Biodiversity","endDate":"2006-01-012008-01-01","orcid":"0000-0002-6553-3786","roleTitle":"Catchment Biodiversity Technical Officer"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/04z7qrj66"},"departmentName":"Merchant Marine College","orcid":"0009-0003-6812-3576","roleTitle":""}
{"affiliationId":{"schema":"","value":""},"departmentName":"Lima","endDate":"2022-03-25","orcid":"0000-0002-9262-5619","roleTitle":"Docente Universitario"}
{"affiliationId":{"schema":"","value":""},"departmentName":"University of Bristol","orcid":"0000-0002-9793-3485","roleTitle":""}
{"departmentName":"US Geological Survey, Ecosystems Mission Area, Cooperative Fish and Wildlife Research Units Program","endDate":"","orcid":"0000-0002-8638-6682","startDate":"2011-06-01"}
{"affiliationId":{"schema":"RINGGOLD","value":"28666"},"departmentName":"English","endDate":"","orcid":"0000-0001-5361-109X","roleTitle":"Assistant Professor","startDate":"2019-07-05"}
{"affiliationId":{"schema":"","value":""},"departmentName":"Centre for Earth System Science","endDate":"2010-03-012012-11-01","orcid":"0000-0001-5323-4431","roleTitle":"Researcher and Executive Officer"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/05bjd0w70"},"departmentName":"Education","endDate":"2013-08-15","orcid":"0000-0001-5960-0586","roleTitle":"Associate Professor and Chair, Department of Education","startDate":"2002-08-15"}
{"affiliationId":{"schema":"RINGGOLD","value":"632513"},"departmentName":"Board ","endDate":"2020-01-01","orcid":"0000-0002-4222-4518","roleTitle":"Boardmember"}
{"affiliationId":{"schema":"GRID","value":"grid.22657.34"},"departmentName":"Faculty of Food Technology","endDate":"2017-01-01","orcid":"0000-0003-2606-8380","roleTitle":"guest scientific assistant, guest researcher, guest lecturer"}
{"affiliationId":{"schema":"FUNDREF","value":"http://dx.doi.org/10.13039/501100008331"},"departmentName":"Radiology","endDate":"","orcid":"0000-0001-6249-450X","startDate":"2016-03-01"}
{"departmentName":"кафедра физики","endDate":"","orcid":"0000-0001-6786-838X","roleTitle":"доцент","startDate":"1981-11-28"}
{"affiliationId":{"schema":"RINGGOLD","value":"6429"},"departmentName":"Molecular and Cellular Physiology","endDate":"2010-01-012013-01-01","orcid":"0000-0002-5538-0464","roleTitle":"Research-Associate"}
{"affiliationId":{"schema":"RINGGOLD","value":"48455"},"departmentName":"Clinical Biochem","endDate":"2016-12-30","orcid":"0000-0002-9563-8044","roleTitle":"Associate Professor","startDate":"2008"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/01xf75524"},"departmentName":"Molecular Oncology","endDate":"2022-01-01","orcid":"0000-0003-0928-003X","roleTitle":""}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/022kthw22"},"departmentName":"Anesthesiology and Perioperative Medicine","endDate":"2022-07-01","orcid":"0000-0001-7410-7271","roleTitle":"Postdoctoral Researcher"}
{"affiliationId":{"schema":"RINGGOLD","value":"125792"},"departmentName":"Environmental Management and Toxicology","endDate":"2006-06-20","orcid":"0000-0001-7855-4183","roleTitle":"Instructional/Tutorial Facilitator"}
{"departmentName":"2004 2007 | Teacher of Fiqh and Usul-al-Fiqh | Islamic University | Moscow, Russia","endDate":"","orcid":"0000-0001-8386-4426","startDate":""}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/0190ak572"},"departmentName":"Biology","endDate":"2024-06-01","orcid":"0009-0001-6766-7876","roleTitle":"Research assistant"}
{"affiliationId":{"schema":"RINGGOLD","value":"16763"},"departmentName":"Education","orcid":"0000-0003-2355-4682","roleTitle":"Profesora titular de Universidad/ Senior Lecturer "}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/0406jsq08"},"departmentName":"Farmácia","endDate":"2023-03-01","orcid":"0009-0009-1661-5046","roleTitle":"Residente farmacêutico"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/02aqsxs83"},"departmentName":"School of Biological Sciences","endDate":"2021-08-16","orcid":"0000-0002-1696-1952","roleTitle":"Assistant Professor"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/00s582s04"},"departmentName":"cajamarca","endDate":"2024-01-01","orcid":"0009-0001-0970-2741","roleTitle":"BACHILLER"}
{"affiliationId":{"schema":"RINGGOLD","value":"186027"},"departmentName":"РЯиК","endDate":"","orcid":"0000-0002-1000-5441","roleTitle":"старший преподаватель","startDate":"2007-09-01"}
{"affiliationId":{"schema":"RINGGOLD","value":"47910"},"departmentName":"Faculty of Life Science and Technology","endDate":"2012-07-082015-10-01","orcid":"0000-0001-7533-998X","roleTitle":"Lecturer"}
{"affiliationId":{"schema":"RINGGOLD","value":"2234"},"departmentName":"Education","endDate":"","orcid":"0000-0001-6123-8483","roleTitle":"Assistant Professor","startDate":"2019-01-07"}
{"affiliationId":{"schema":"ROR","value":"https://ror.org/04qkymg17"},"departmentName":"genera surgical ward","endDate":"2013-10-012015-02-01","orcid":"0009-0009-7638-0453","roleTitle":"Registered Nurse"}
{"affiliationId":{"schema":"RINGGOLD","value":"575342"},"departmentName":"GBUZ Naucno-prakticeskij psihonevrologiceskij centr imeni Z P Solov'eva Departamenta zdravoohranenia goroda Moskvy","endDate":"2022-11-01","orcid":"0000-0002-0344-9765","roleTitle":""}
{"affiliationId":{"schema":"","value":""},"departmentName":"SERVICE DE CHIRURGIE ORTHOPÉDIQUE ET TRAUMATOLOGIE","endDate":"2024-02-01","orcid":"0009-0003-2119-1766","roleTitle":"CHIRURGIEN ORTHOPEDISTE"}

View File

@ -0,0 +1 @@
{"doi":"https://doi.org/10.1007/978-3-030-75768-7","OpenAlexId":"https://openalex.org/W2124362779"}

View File

@ -8,7 +8,6 @@ import java.nio.charset.StandardCharsets;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
@ -23,7 +22,6 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson; import com.google.gson.Gson;
@ -32,6 +30,8 @@ import eu.dnetlib.dhp.api.model.CommunityEntityMap;
import eu.dnetlib.dhp.api.model.EntityCommunities; import eu.dnetlib.dhp.api.model.EntityCommunities;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bulktag.community.*; import eu.dnetlib.dhp.bulktag.community.*;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.Context;
@ -93,9 +93,10 @@ public class SparkBulkTagJob {
ProtoMap protoMap = new Gson().fromJson(temp, ProtoMap.class); ProtoMap protoMap = new Gson().fromJson(temp, ProtoMap.class);
log.info("pathMap: {}", new Gson().toJson(protoMap)); log.info("pathMap: {}", new Gson().toJson(protoMap));
SelectionConstraints taggingConstraints = new Gson() TaggingConstraints taggingConstraints = new Gson()
.fromJson(parser.get("taggingCriteria"), SelectionConstraints.class); .fromJson(parser.get("taggingCriteria"), TaggingConstraints.class);
taggingConstraints.setSelection(VerbResolverFactory.newInstance());
taggingConstraints.getTags().forEach(t -> t.setSelection(VerbResolverFactory.newInstance()));
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
CommunityConfiguration cc; CommunityConfiguration cc;
@ -277,13 +278,8 @@ public class SparkBulkTagJob {
String outputPath, String outputPath,
ProtoMap protoMappingParams, ProtoMap protoMappingParams,
CommunityConfiguration communityConfiguration, CommunityConfiguration communityConfiguration,
SelectionConstraints taggingConstraints) { TaggingConstraints taggingConstraints) {
try {
System.out.println(new ObjectMapper().writeValueAsString(protoMappingParams));
} catch (JsonProcessingException e) {
throw new RuntimeException(e);
}
ModelSupport.entityTypes ModelSupport.entityTypes
.keySet() .keySet()
.parallelStream() .parallelStream()
@ -295,30 +291,22 @@ public class SparkBulkTagJob {
readPath(spark, inputPath + e.name(), resultClazz) readPath(spark, inputPath + e.name(), resultClazz)
.map(patchResult(), Encoders.bean(resultClazz)) .map(patchResult(), Encoders.bean(resultClazz))
.filter(Objects::nonNull) .filter(Objects::nonNull)
.map((MapFunction<R, Tagging>) value -> resultTagger .map(
(MapFunction<R, R>) value -> resultTagger
.enrichContextCriteria( .enrichContextCriteria(
value, communityConfiguration, protoMappingParams, taggingConstraints), value, communityConfiguration, protoMappingParams, taggingConstraints),
Encoders.bean(Tagging.class)) Encoders.bean(resultClazz))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath + e.name());// writing the tagging in the working dir for entity .json(outputPath + e.name());// writing the tagging in the working dir for entity
readPath(spark, outputPath + e.name(), Tagging.class) readPath(spark, outputPath + e.name(), resultClazz)
.map((MapFunction<Tagging, R>) t -> (R) t.getResult(), Encoders.bean(resultClazz) )// copy the tagging in the actual result output path
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(inputPath + e.name()); .json(inputPath + e.name());
readPath(spark, outputPath + e.name(), Tagging.class)
.map((MapFunction<Tagging, String>) t -> t.getTag(), Encoders.STRING() )// copy the tagging in the actual result output path
.filter(Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json("/user/miriam.baglioni/graphTagging/" + e.name());
}); });
} }

View File

@ -1,6 +1,8 @@
package eu.dnetlib.dhp.bulktag; package eu.dnetlib.dhp.bulktag;
import java.io.Serializable; import java.io.Serializable;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
public class Tagging<R extends Result> implements Serializable { public class Tagging<R extends Result> implements Serializable {

View File

@ -10,8 +10,6 @@ import java.lang.reflect.Method;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.bulktag.Tagging;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -21,6 +19,7 @@ import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath; import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.PathNotFoundException; import com.jayway.jsonpath.PathNotFoundException;
import eu.dnetlib.dhp.bulktag.Tagging;
import eu.dnetlib.dhp.bulktag.actions.MapModel; import eu.dnetlib.dhp.bulktag.actions.MapModel;
import eu.dnetlib.dhp.bulktag.actions.Parameters; import eu.dnetlib.dhp.bulktag.actions.Parameters;
import eu.dnetlib.dhp.bulktag.eosc.EoscIFTag; import eu.dnetlib.dhp.bulktag.eosc.EoscIFTag;
@ -93,18 +92,18 @@ public class ResultTagger implements Serializable {
} }
public <R extends Result> Tagging enrichContextCriteria( public <R extends Result> R enrichContextCriteria(
final R result, final CommunityConfiguration conf, final Map<String, MapModel> criteria, SelectionConstraints taggingConstraints) final R result, final CommunityConfiguration conf, final Map<String, MapModel> criteria,
TaggingConstraints taggingConstraints)
throws InvocationTargetException, NoSuchMethodException { throws InvocationTargetException, NoSuchMethodException {
// Verify if the entity is deletedbyinference. In case verify if to clean the context list // Verify if the entity is deletedbyinference. In case verify if to clean the context list
// from all the zenodo communities // from all the zenodo communities
if (result.getDataInfo().getDeletedbyinference()) { if (result.getDataInfo().getDeletedbyinference()) {
clearContext(result); clearContext(result);
return Tagging.newInstance(result, null); return result;
} }
String retString = null;
final Map<String, List<String>> param = getParamMap(result, criteria); final Map<String, List<String>> param = getParamMap(result, criteria);
// Execute the EOSCTag for the services // Execute the EOSCTag for the services
@ -123,8 +122,11 @@ public class ResultTagger implements Serializable {
} }
//adding code for tagging of results searching supplementaryMaterial //adding code for tagging of results searching supplementaryMaterial
if(taggingConstraints.getCriteria().stream().anyMatch(crit -> crit.verifyCriteria(param))) final Set<String> tags = new HashSet<>();
retString = "supplementary"; taggingConstraints.getTags().forEach(t -> {
if (t.getCriteria().stream().anyMatch(crit -> crit.verifyCriteria(param)))
tags.add(t.getTagId());
});
// communities contains all the communities to be not added to the context // communities contains all the communities to be not added to the context
final Set<String> removeCommunities = new HashSet<>(); final Set<String> removeCommunities = new HashSet<>();
@ -253,10 +255,26 @@ public class ResultTagger implements Serializable {
clearContext(result); clearContext(result);
/* Verify if there is something to bulktag */ /* Verify if there is something to bulktag */
if (communities.isEmpty()) { if (communities.isEmpty() && tags.isEmpty()) {
return Tagging.newInstance(result, retString); return result;
} }
// Materialize every matched annotation tag as a Context entry on the result.
tags.forEach(t -> {
	Context con = new Context();
	con.setId(t);
	// Provenance describing the entry as a graph annotation (inferred, not deleted by inference).
	List<DataInfo> dataInfoList = Arrays
		.asList(
			OafMapperUtils
				.dataInfo(
					false, ANNOTATION_DATA_INFO_TYPE, true, false,
					OafMapperUtils
						.qualifier(
							CLASS_ID_ANNOTATION, CLASS_NAME_ANNOTATION, DNET_PROVENANCE_ACTIONS,
							DNET_PROVENANCE_ACTIONS),
					TAGGING_TRUST));
	// Attach the provenance to the context; without this call the list above is dead code
	// and the annotation context is written out with no dataInfo.
	con.setDataInfo(dataInfoList);
	result.getContext().add(con);
});
result.getContext().forEach(c -> { result.getContext().forEach(c -> {
final String cId = c.getId(); final String cId = c.getId();
if (communities.contains(cId)) { if (communities.contains(cId)) {
@ -321,7 +339,7 @@ public class ResultTagger implements Serializable {
result.getContext().stream().map(Context::getId).collect(Collectors.toSet())); result.getContext().stream().map(Context::getId).collect(Collectors.toSet()));
if (communities.isEmpty()) if (communities.isEmpty())
return Tagging.newInstance(result, retString); return result;
List<Context> toaddcontext = communities List<Context> toaddcontext = communities
.stream() .stream()
@ -381,7 +399,7 @@ public class ResultTagger implements Serializable {
.collect(Collectors.toList()); .collect(Collectors.toList());
result.getContext().addAll(toaddcontext); result.getContext().addAll(toaddcontext);
return Tagging.newInstance(result, retString); return result;
} }
} }

View File

@ -8,6 +8,10 @@ public class TaggingConstants {
public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging"; public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging";
public static final String ANNOTATION_DATA_INFO_TYPE = "annotation";
public static final String CLASS_ID_ANNOTATION = "graph:annotation";
public static final String CLASS_NAME_ANNOTATION = "Graph Annotation";
public static final String CLASS_ID_SUBJECT = "community:subject"; public static final String CLASS_ID_SUBJECT = "community:subject";
public static final String CLASS_ID_DATASOURCE = "community:datasource"; public static final String CLASS_ID_DATASOURCE = "community:datasource";
public static final String CLASS_ID_CZENODO = "community:zenodocommunity"; public static final String CLASS_ID_CZENODO = "community:zenodocommunity";

View File

@ -0,0 +1,14 @@
package eu.dnetlib.dhp.bulktag.community;
/**
 * A set of selection constraints paired with the identifier of the tag to apply
 * when a result satisfies them.
 *
 * <p>The field name {@code tagId} is part of the JSON contract: instances are created
 * by Gson, which binds JSON keys to fields by name.
 */
public class TaggingConstraint extends SelectionConstraints {

	/** Identifier of the tag associated with these constraints. */
	private String tagId;

	/**
	 * @return the identifier of the tag associated with these constraints
	 */
	public String getTagId() {
		return this.tagId;
	}

	/**
	 * @param tagId the identifier of the tag associated with these constraints
	 */
	public void setTagId(final String tagId) {
		this.tagId = tagId;
	}
}

View File

@ -0,0 +1,16 @@
package eu.dnetlib.dhp.bulktag.community;
import java.io.Serializable;
import java.util.List;
/**
 * Container for the list of {@link TaggingConstraint} read from the {@code taggingCriteria}
 * job parameter.
 *
 * <p>The class is {@link Serializable} because the instance is captured by the Spark map
 * function that tags each result, and Spark has to serialize everything such a closure
 * references before shipping it to the executors.
 */
public class TaggingConstraints implements Serializable {

	private static final long serialVersionUID = 1L;

	/** Tagging constraints, one per tag; bound by Gson from the JSON key {@code tags}. */
	private List<TaggingConstraint> tags;

	/**
	 * @return the configured tagging constraints (may be {@code null} if never set)
	 */
	public List<TaggingConstraint> getTags() {
		return tags;
	}

	/**
	 * @param tags the tagging constraints to use
	 */
	public void setTags(List<TaggingConstraint> tags) {
		this.tags = tags;
	}
}

View File

@ -0,0 +1,4 @@
sourcePath=/tmp/miriam/12_graph_copy
pathMap=/data/bulktagging/pathMap
baseURL=https://services.openaire.eu/openaire/community/
taggingCriteria={"tags":[{"tagId":"SM","criteria":[{"constraint":[{"verb":"starts_with_caseinsensitive","field":"title","value":"supplementary material for"},{"verb":"starts_with_caseinsensitive","field":"title","value":"supplementary document for"},{"verb":"starts_with_caseinsensitive","field":"title","value":"figure"},{"verb":"starts_with_caseinsensitive","field":"title","value":"supplementary figure"},{"verb":"starts_with_caseinsensitive","field":"title","value":"supplemental figure"},{"verb":"starts_with_caseinsensitive","field":"title","value":"supplementary table"},{"verb":"starts_with_caseinsensitive","field":"title","value":"table for"}]}]}]}

View File

@ -5,7 +5,6 @@ import java.io.StringReader;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.solr.ExternalReference;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
@ -31,6 +30,7 @@ import eu.dnetlib.dhp.schema.solr.Context;
import eu.dnetlib.dhp.schema.solr.Country; import eu.dnetlib.dhp.schema.solr.Country;
import eu.dnetlib.dhp.schema.solr.Datasource; import eu.dnetlib.dhp.schema.solr.Datasource;
import eu.dnetlib.dhp.schema.solr.EoscIfGuidelines; import eu.dnetlib.dhp.schema.solr.EoscIfGuidelines;
import eu.dnetlib.dhp.schema.solr.ExternalReference;
import eu.dnetlib.dhp.schema.solr.Instance; import eu.dnetlib.dhp.schema.solr.Instance;
import eu.dnetlib.dhp.schema.solr.Journal; import eu.dnetlib.dhp.schema.solr.Journal;
import eu.dnetlib.dhp.schema.solr.Measure; import eu.dnetlib.dhp.schema.solr.Measure;
@ -562,10 +562,16 @@ public class ProvisionModelSupport {
.orElse(null); .orElse(null);
} }
private static List<ExternalReference> mapExternalReference(List<eu.dnetlib.dhp.schema.oaf.ExternalReference> externalReference) { private static List<ExternalReference> mapExternalReference(
return Optional.ofNullable(externalReference) List<eu.dnetlib.dhp.schema.oaf.ExternalReference> externalReference) {
.map(ext -> ext.stream() return Optional
.map(e -> ExternalReference.newInstance( .ofNullable(externalReference)
.map(
ext -> ext
.stream()
.map(
e -> ExternalReference
.newInstance(
e.getSitename(), e.getSitename(),
e.getLabel(), e.getLabel(),
e.getAlternateLabel(), e.getAlternateLabel(),

View File

@ -1,12 +1,13 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.URI; import java.net.URI;
import java.nio.file.Path; import java.nio.file.Path;
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery;
@ -32,14 +33,13 @@ import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mock; import org.mockito.Mock;
import org.mockito.Mockito; import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoExtension;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import static org.junit.jupiter.api.Assertions.assertEquals; import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
public class SolrConfigExploreTest { public class SolrConfigExploreTest {
@ -180,7 +180,8 @@ public class SolrConfigExploreTest {
new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize) new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize)
.run(isLookupClient); .run(isLookupClient);
Assertions.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus()); Assertions
.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus());
String[] queryStrings = { String[] queryStrings = {
"cancer", "cancer",
@ -200,7 +201,8 @@ public class SolrConfigExploreTest {
// System.out.println(rsp.getExplainMap()); // System.out.println(rsp.getExplainMap());
for (SolrDocument doc : rsp.getResults()) { for (SolrDocument doc : rsp.getResults()) {
log.info( log
.info(
doc.get("score") + "\t" + doc.get("score") + "\t" +
doc.get("__indexrecordidentifier") + "\t" + doc.get("__indexrecordidentifier") + "\t" +
doc.get("resultidentifier") + "\t" + doc.get("resultidentifier") + "\t" +

View File

@ -85,7 +85,8 @@ public class SolrConfigTest extends SolrTest {
new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize) new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize)
.run(isLookupClient); .run(isLookupClient);
Assertions.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus()); Assertions
.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus());
String[] queryStrings = { String[] queryStrings = {
"cancer", "cancer",