implementation for bulk tagging

This commit is contained in:
Miriam Baglioni 2020-03-03 16:38:50 +01:00
parent e80f80ca93
commit d9d2060561
24 changed files with 1307 additions and 1 deletions

View File

@ -5,11 +5,46 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.0.5-SNAPSHOT</version> <version>1.1.6-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<artifactId>dhp-bulktag</artifactId> <artifactId>dhp-bulktag</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
<version>0.9.11</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project> </project>

View File

@ -0,0 +1,64 @@
package eu.dnetlib.dhp;
import com.google.gson.Gson;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.ArrayList;
import java.util.List;
/**
* Created by miriam on 01/08/2018.
*/
public class Community {
private static final Log log = LogFactory.getLog(Community.class);
private String id;
private List<String> subjects = new ArrayList<>();
private List<Datasource> datasources = new ArrayList<>();
private List<ZenodoCommunity> zenodoCommunities = new ArrayList<>();
public String toJson() {
final Gson g = new Gson();
return g.toJson(this);
}
public boolean isValid() {
return !getSubjects().isEmpty() || !getDatasources().isEmpty() || !getZenodoCommunities().isEmpty();
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<String> getSubjects() {
return subjects;
}
public void setSubjects(List<String> subjects) {
this.subjects = subjects;
}
public List<Datasource> getDatasources() {
return datasources;
}
public void setDatasources(List<Datasource> datasources) {
this.datasources = datasources;
}
public List<ZenodoCommunity> getZenodoCommunities() {
return zenodoCommunities;
}
public void setZenodoCommunities(List<ZenodoCommunity> zenodoCommunities) {
this.zenodoCommunities = zenodoCommunities;
}
}

View File

@ -0,0 +1,170 @@
package eu.dnetlib.dhp;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter;
import eu.dnetlib.dhp.selectioncriteria.Selection;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Created by miriam on 02/08/2018.
*/
public class CommunityConfiguration {
private static final Log log = LogFactory.getLog(CommunityConfiguration.class);
private Map<String,Community> communities;
//map subject -> communityid
private transient Map<String,List<Pair<String,SelectionConstraints>>> subjectMap = new HashMap<>();
//map datasourceid -> communityid
private transient Map<String,List<Pair<String,SelectionConstraints>>> datasourceMap = new HashMap<>();
//map zenodocommunityid -> communityid
private transient Map<String,List<Pair<String,SelectionConstraints>>> zenodocommunityMap = new HashMap<>();
CommunityConfiguration(final Map<String, Community> communities) {
this.communities = communities;
init();
}
void init() {
if (subjectMap == null) {
subjectMap = Maps.newHashMap();
}
if (datasourceMap == null) {
datasourceMap = Maps.newHashMap();
}
if (zenodocommunityMap == null) {
zenodocommunityMap = Maps.newHashMap();
}
for(Community c : getCommunities().values()) {
//get subjects
final String id = c.getId();
for(String sbj : c.getSubjects()){
Pair<String,SelectionConstraints> p = new Pair<>(id,new SelectionConstraints());
add(sbj.toLowerCase().trim() , p, subjectMap);
}
//get datasources
for(Datasource d: c.getDatasources()){
add(d.getOpenaireId(),new Pair<>(id,d.getSelectionConstraints()),datasourceMap);
}
//get zenodo communities
for(ZenodoCommunity zc : c.getZenodoCommunities()){
add(zc.getZenodoCommunityId(),new Pair<>(id,zc.getSelCriteria()),zenodocommunityMap);
}
}
}
private void add(String key, Pair<String,SelectionConstraints> value, Map<String,List<Pair<String,SelectionConstraints>>> map){
List<Pair<String,SelectionConstraints>> values = map.get(key);
if (values == null){
values = new ArrayList<>();
map.put(key,values);
}
values.add(value);
}
public List<Pair<String,SelectionConstraints>> getCommunityForSubject(String sbj){
return subjectMap.get(sbj);
}
public List<Pair<String,SelectionConstraints>> getCommunityForDatasource(String dts){
return datasourceMap.get(dts);
}
public List<String> getCommunityForDatasource(final String dts, final Map<String, List<String>> param) {
List<Pair<String,SelectionConstraints>> lp = datasourceMap.get(dts);
if (lp==null)
return Lists.newArrayList();
return lp.stream().map(p -> {
if (p.getSnd() == null)
return p.getFst();
if (((SelectionConstraints) p.getSnd()).verifyCriteria(param))
return p.getFst();
else
return null;
}).filter(st->(st!=null)).collect(Collectors.toList());
}
public List<Pair<String,SelectionConstraints>> getCommunityForZenodoCommunity(String zc){
return zenodocommunityMap.get(zc);
}
public List<String> getCommunityForSubjectValue(String value) {
return getContextIds(subjectMap.get(value));
}
public List<String> getCommunityForDatasourceValue(String value) {
return getContextIds(datasourceMap.get(value.toLowerCase()));
}
public List<String> getCommunityForZenodoCommunityValue(String value){
return getContextIds(zenodocommunityMap.get(value.toLowerCase()));
}
private List<String> getContextIds(List<Pair<String, SelectionConstraints>> list) {
if (list != null) {
return list.stream().map(p -> p.getFst()).collect(Collectors.toList());
}
return Lists.newArrayList();
}
public Map<String, Community> getCommunities() {
return communities;
}
public void setCommunities(Map<String, Community> communities) {
this.communities = communities;
}
public String toJson() {
GsonBuilder builder = new GsonBuilder();
builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
Gson gson = builder.create();
return gson.toJson(this);
}
public int size() {
return communities.keySet().size();
}
public Community getCommunityById(String id){
return communities.get(id);
}
public List<Community> getCommunityList() {
return Lists.newLinkedList(communities.values());
}
}

View File

@ -0,0 +1,146 @@
package eu.dnetlib.dhp;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter;
import eu.dnetlib.dhp.selectioncriteria.Selection;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
import eu.dnetlib.dhp.selectioncriteria.VerbResolverFactory;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* Created by miriam on 03/08/2018.
*/
public class CommunityConfigurationFactory {
private static final Log log = LogFactory.getLog(CommunityConfigurationFactory.class);
private static VerbResolver resolver = VerbResolverFactory.newInstance();
public static CommunityConfiguration newInstance(final String xml) throws DocumentException {
log.debug(String.format("parsing community configuration from:\n%s", xml));
final Document doc = new SAXReader().read(new StringReader(xml));
final Map<String,Community> communities = Maps.newHashMap();
for(final Object o : doc.selectNodes("//community")) {
final Node node = (Node) o;
final Community community = parseCommunity(node);
if (community.isValid()) {
communities.put(community.getId(), community);
}
}
log.info(String.format("loaded %s community configuration profiles", communities.size()));
log.debug(String.format("loaded community configuration:\n%s", communities.toString()));
return new CommunityConfiguration(communities);
}
public static CommunityConfiguration fromJson(final String json) {
GsonBuilder builder = new GsonBuilder();
builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
Gson gson = builder.create();
final CommunityConfiguration conf = gson.fromJson(json, CommunityConfiguration.class);
log.info(String.format("loaded %s community configuration profiles", conf.size()));
conf.init();
log.info("created inverse maps");
return conf;
}
private static Community parseCommunity(final Node node) {
final Community c = new Community();
c.setId(node.valueOf("./@id"));
log.info(String.format("community id: %s", c.getId()));
c.setSubjects(parseSubjects(node));
c.setDatasources(parseDatasources(node));
c.setZenodoCommunities(parseZenodoCommunities(node));
return c;
}
private static List<String> parseSubjects(final Node node) {
final List<String> subjects = Lists.newArrayList();
final List <Node> list = node.selectNodes("./subjects/subject");
for(Node n : list){
log.debug("text of the node " + n.getText());
subjects.add(StringUtils.trim(n.getText()));
}
log.info("size of the subject list " + subjects.size());
return subjects;
}
private static List<Datasource> parseDatasources(final Node node) {
final List <Node> list = node.selectNodes("./datasources/datasource");
final List<Datasource> datasourceList = new ArrayList<>();
for(Node n : list){
Datasource d = new Datasource();
d.setOpenaireId(n.selectSingleNode("./openaireId").getText());
d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver);
datasourceList.add(d);
}
log.info("size of the datasource list " + datasourceList.size());
return datasourceList;
}
private static List<ZenodoCommunity> parseZenodoCommunities(final Node node) {
final Node oacommunitynode = node.selectSingleNode("./oacommunity");
String oacommunity = null;
if (oacommunitynode != null){
String tmp = oacommunitynode.getText();
if(StringUtils.isNotBlank(tmp))
oacommunity = tmp;
}
final List<Node> list = node.selectNodes("./zenodocommunities/zenodocommunity");
final List<ZenodoCommunity> zenodoCommunityList = new ArrayList<>();
for(Node n : list){
ZenodoCommunity zc = new ZenodoCommunity();
zc.setZenodoCommunityId(n.selectSingleNode("./zenodoid").getText());
zc.setSelCriteria(n.selectSingleNode("./selcriteria"));
zenodoCommunityList.add(zc);
}
if(oacommunity != null){
ZenodoCommunity zc = new ZenodoCommunity();
zc.setZenodoCommunityId(oacommunity);
zenodoCommunityList.add(zc);
}
log.info("size of the zenodo community list " + zenodoCommunityList.size());
return zenodoCommunityList;
}
}

View File

@ -0,0 +1,62 @@
package eu.dnetlib.dhp;
import eu.dnetlib.dhp.selectioncriteria.Selection;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
public class Constraint implements Serializable {
private String verb;
private String field;
private String value;
private Selection selection;
public Constraint() {
}
public String getVerb() {
return verb;
}
public void setVerb(String verb) {
this.verb = verb;
}
public String getField() {
return field;
}
public void setField(String field) {
this.field = field;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
public void setSelection(Selection sel){
selection = sel;
}
public void setSelection(VerbResolver resolver) throws InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException {
selection = resolver.getSelectionCriteria(verb,value);
}
public boolean verifyCriteria(String metadata){
return selection.apply(metadata);
}
}

View File

@ -0,0 +1,79 @@
package eu.dnetlib.dhp;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Type;
import java.util.Collection;
import java.util.List;
import java.util.Map;
/**
* Created by miriam on 02/08/2018.
*/
public class Constraints implements Serializable {
private static final Log log = LogFactory.getLog(Constraints.class);
//private ConstraintEncapsulator ce;
private List<Constraint> constraint;
public Constraints() {
}
public List<Constraint> getConstraint() {
return constraint;
}
public void setConstraint(List<Constraint> constraint) {
this.constraint = constraint;
}
public void setSc(String json){
Type collectionType = new TypeToken<Collection<Constraint>>(){}.getType();
constraint = new Gson().fromJson(json, collectionType);
}
void setSelection(VerbResolver resolver) {
for(Constraint st: constraint){
try {
st.setSelection(resolver);
} catch (NoSuchMethodException e) {
log.error(e.getMessage());
} catch (IllegalAccessException e) {
log.error(e.getMessage());
} catch (InvocationTargetException e) {
log.error(e.getMessage());
} catch (InstantiationException e) {
log.error(e.getMessage());
}
}
}
//Constraint in and
public boolean verifyCriteria(final Map<String, List<String>> param) {
for(Constraint sc : constraint) {
boolean verified = false;
for(String value : param.get(sc.getField())){
if (sc.verifyCriteria(value.trim())){
verified = true;
}
}
if(!verified)
return verified;
}
return true;
}
}

View File

@ -0,0 +1,65 @@
package eu.dnetlib.dhp;
import com.google.gson.Gson;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Node;
/**
* Created by miriam on 01/08/2018.
*/
public class Datasource {
private static final Log log = LogFactory.getLog(Datasource.class);
private String openaireId;
private SelectionConstraints selectionConstraints;
public SelectionConstraints getSelCriteria() {
return selectionConstraints;
}
public SelectionConstraints getSelectionConstraints() {
return selectionConstraints;
}
public void setSelectionConstraints(SelectionConstraints selectionConstraints) {
this.selectionConstraints = selectionConstraints;
}
public void setSelCriteria(SelectionConstraints selCriteria) {
this.selectionConstraints = selCriteria;
}
public String getOpenaireId() {
return openaireId;
}
public void setOpenaireId(String openaireId) {
this.openaireId = openaireId;
}
private void setSelCriteria(String json, VerbResolver resolver){
log.info("Selection constraints for datasource = " + json);
selectionConstraints = new Gson().fromJson(json, SelectionConstraints.class);
selectionConstraints.setSelection(resolver);
}
public void setSelCriteria(Node n, VerbResolver resolver){
try{
setSelCriteria(n.getText(),resolver);
}catch(Exception e) {
log.info("not set selection criteria... ");
selectionConstraints =null;
}
}
}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.dhp;
import com.google.gson.Gson;
/**
* Created by miriam on 03/08/2018.
*/
public class Pair<A,B> {
private A fst;
private B snd;
public A getFst() {
return fst;
}
public Pair setFst(A fst) {
this.fst = fst;
return this;
}
public B getSnd() {
return snd;
}
public Pair setSnd(B snd) {
this.snd = snd;
return this;
}
public Pair(A a, B b){
fst = a;
snd = b;
}
public String toJson(){
return new Gson().toJson(this);
}
}

View File

@ -0,0 +1,10 @@
package eu.dnetlib.dhp;
import java.util.HashMap;
public class ProtoMap extends HashMap<String,String> {
public ProtoMap(){
super();
}
}

View File

@ -0,0 +1,66 @@
package eu.dnetlib.dhp;
import com.google.common.base.Joiner;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.dom4j.DocumentException;
import java.util.List;
public class QueryInformationSystem {
private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') " +
" let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() " +
" let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept " +
" let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept " +
" let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept " +
" where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " +
" return " +
" <community> " +
" { $x//CONFIGURATION/context/@id} " +
" <subjects> " +
" {for $y in tokenize($subj,',') " +
" return " +
" <subject>{$y}</subject>} " +
" </subjects> " +
" <datasources> " +
" {for $d in $datasources " +
" where $d/param[./@name='enabled']/text()='true' " +
" return " +
" <datasource> " +
" <openaireId> " +
" {$d//param[./@name='openaireId']/text()} " +
" </openaireId> " +
" <selcriteria> " +
" {$d/param[./@name='selcriteria']/text()} " +
" </selcriteria> " +
" </datasource> } " +
" </datasources> " +
" <zenodocommunities> " +
" {for $zc in $communities " +
" return " +
" <zenodocommunity> " +
" <zenodoid> " +
" {$zc/param[./@name='zenodoid']/text()} " +
" </zenodoid> " +
" <selcriteria> " +
" {$zc/param[./@name='selcriteria']/text()} " +
" </selcriteria> " +
" </zenodocommunity>} " +
" </zenodocommunities> " +
" </community>";
public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl) throws ISLookUpException, DocumentException {
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
final List<String> res = isLookUp.quickSearchProfile(XQUERY);
final String xmlConf = "<communities>" + Joiner.on(" ").join(res) + "</communities>";
return CommunityConfigurationFactory.newInstance(xmlConf);
}
}

View File

@ -0,0 +1,175 @@
package eu.dnetlib.dhp;
import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static eu.dnetlib.dhp.TagginConstants.*;
/**
* Created by miriam on 02/08/2018.
*/
public class ResultTagger {
private String trust = "0.8";
private boolean clearContext(Result result){
int tmp = result.getContext().size();
List<Context> clist = result.getContext().stream()
.filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR))).collect(Collectors.toList());
result.setContext(clist);
return (tmp != clist.size());
}
private Map<String,List<String>> getParamMap(final Result result, Map<String,String> params) {
Map<String,List<String>> param = new HashMap<>();
String json = new Gson().toJson(result,Result.class);
DocumentContext jsonContext = JsonPath.parse(json);
if (params == null){
params = new HashMap<>();
}
for(String key : params.keySet()) {
try {
param.put(key, jsonContext.read(params.get(key)));
} catch (com.jayway.jsonpath.PathNotFoundException e) {
param.put(key, new ArrayList<>());
// throw e;
}
}
return param;
}
public Result enrichContextCriteria(final Result result, final CommunityConfiguration conf, final Map<String,String> criteria) {
final Map<String, List<String>> param = getParamMap(result, criteria);
//Verify if the entity is deletedbyinference. In case verify if to clean the context list from all the zenodo communities
if(result.getDataInfo().getDeletedbyinference()){
return result;
}
//communities contains all the communities to be added as context for the result
final Set<String> communities = new HashSet<>();
//tagging for Subject
final Set<String> subjects = new HashSet<>();
result.getSubject().stream()
.map(subject -> subject.getValue())
.filter(StringUtils::isNotBlank)
.map(String::toLowerCase)
.map(String::trim)
.collect(Collectors.toCollection(HashSet::new))
.forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s)));
communities.addAll(subjects);
//Tagging for datasource
final Set<String> datasources = new HashSet<>();
final Set<String> tmp = new HashSet<>();
for(Instance i : result.getInstance()){
tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(),"|"));
tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(),"|"));
}
result.getInstance()
.stream()
.map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey()))
.flatMap(p -> Stream.of(p.getFst(), p.getSnd()))
.map(s -> StringUtils.substringAfter(s, "|"))
.collect(Collectors.toCollection(HashSet::new))
.forEach(dsId -> datasources.addAll(conf.getCommunityForDatasource(dsId,param)));
communities.addAll(datasources);
/*Tagging for Zenodo Communities*/
final Set<String> czenodo = new HashSet<>();
//final ResultProtos.Result.Metadata.Builder mBuilder = builder.getEntityBuilder().getResultBuilder().getMetadataBuilder();
result.getContext()
.stream()
.filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR))
.collect(Collectors.toList())
.forEach(c->czenodo.addAll(conf.getCommunityForZenodoCommunityValue(c.getId().substring(c.getId().lastIndexOf("/")+1).trim())));
communities.addAll(czenodo);
clearContext(result);
/*Verify if there is something to bulktag*/
if(communities.isEmpty()){
return result;
}
result.getContext()
.stream()
.map(c -> {
if(communities.contains(c.getId())){
List<DataInfo> dataInfoList = c.getDataInfo();
if (subjects.contains(c.getId()))
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT));
if (datasources.contains(c.getId()))
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE));
if (czenodo.contains(c.getId()))
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO));
}
return c;
})
.collect(Collectors.toList());
communities.removeAll(result.getContext().stream().map(c -> c.getId()).collect(Collectors.toSet()));
if(communities.isEmpty())
return result;
List<Context> toaddcontext = communities
.stream()
.map(c -> {
Context context = new Context();
context.setId(c);
List<DataInfo> dataInfoList = Arrays.asList();
if (subjects.contains(c))
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_SUBJECT, CLASS_NAME_BULKTAG_SUBJECT));
if (datasources.contains(c))
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_DATASOURCE, CLASS_NAME_BULKTAG_DATASOURCE));
if (czenodo.contains(c))
dataInfoList.add(getDataInfo(BULKTAG_DATA_INFO_TYPE, CLASS_ID_CZENODO, CLASS_NAME_BULKTAG_ZENODO));
context.setDataInfo(dataInfoList);
return context;
})
.collect(Collectors.toList());
result.getContext().addAll(toaddcontext);
return result;
}
public static DataInfo getDataInfo(String inference_provenance, String inference_class_id, String inference_class_name){
DataInfo di = new DataInfo();
di.setInferred(true);
di.setInferenceprovenance(inference_provenance);
di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name));
return di;
}
public static Qualifier getQualifier(String inference_class_id, String inference_class_name) {
Qualifier pa = new Qualifier();
pa.setClassid(inference_class_id);
pa.setClassname(inference_class_name);
pa.setSchemeid(DNET_SCHEMA_ID);
pa.setSchemename(DNET_SCHEMA_NAME);
return pa;
}
}

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import eu.dnetlib.dhp.selectioncriteria.VerbResolver;
import java.io.Serializable;
import java.lang.reflect.Type;
import java.util.Collection;
import java.util.List;
import java.util.Map;
public class SelectionConstraints implements Serializable {
private List<Constraints> criteria;
public SelectionConstraints() {
}
public List<Constraints> getCriteria() {
return criteria;
}
public void setCriteria(List<Constraints> criteria) {
this.criteria = criteria;
}
public void setSc(String json){
Type collectionType = new TypeToken<Collection<Constraints>>(){}.getType();
criteria = new Gson().fromJson(json, collectionType);
}
//Constraints in or
public boolean verifyCriteria(final Map<String, List<String>> param){
for(Constraints selc : criteria) {
if(selc.verifyCriteria(param)){
return true;
}
}
return false;
}
public void setSelection(VerbResolver resolver) {
for(Constraints cs : criteria){
cs.setSelection(resolver);
}
}
}

View File

@ -0,0 +1,71 @@
package eu.dnetlib.dhp;
import java.io.File;
import java.util.Arrays;
import java.util.List;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
public class SparkBulkTagJob {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkBulkTagJob.class.getResourceAsStream("/eu/dnetlib/dhp/input_bulktag_parameters.json")));
parser.parseArgument(args);
final SparkSession spark = SparkSession
.builder()
.appName(SparkBulkTagJob.class.getSimpleName())
.master(parser.get("master"))
.enableHiveSupport()
.getOrCreate();
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final String inputPath = parser.get("sourcePath");
final String outputPath = "/tmp/provision/bulktagging";
final ResultTagger resultTagger = new ResultTagger();
ProtoMap protoMappingParams = new Gson().fromJson(parser.get("mappingProto"),ProtoMap.class);;
File directory = new File(outputPath);
if (!directory.exists()) {
directory.mkdirs();
}
CommunityConfiguration cc = QueryInformationSystem.getCommunityConfiguration(parser.get("isLookupUrl"));
sc.sequenceFile(inputPath + "/publication", Text.class, Text.class)
.map(item -> new ObjectMapper().readValue(item._2().toString(), Publication.class))
.map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams))
.map(p -> new ObjectMapper().writeValueAsString(p))
.saveAsTextFile(outputPath+"/publication");
sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class)
.map(item -> new ObjectMapper().readValue(item._2().toString(), Dataset.class))
.map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams))
.map(p -> new ObjectMapper().writeValueAsString(p))
.saveAsTextFile(outputPath+"/dataset");
sc.sequenceFile(inputPath + "/software", Text.class, Text.class)
.map(item -> new ObjectMapper().readValue(item._2().toString(), Software.class))
.map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams))
.map(p -> new ObjectMapper().writeValueAsString(p))
.saveAsTextFile(outputPath+"/software");
sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class)
.map(item -> new ObjectMapper().readValue(item._2().toString(), OtherResearchProduct.class))
.map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams))
.map(p -> new ObjectMapper().writeValueAsString(p))
.saveAsTextFile(outputPath+"/otherresearchproduct");
}
}

View File

@ -0,0 +1,27 @@
package eu.dnetlib.dhp;
public class TagginConstants {
public final static String BULKTAG_DATA_INFO_TYPE = "bulktagging";
public final static String DNET_SCHEMA_NAME = "dnet:provenanceActions";
public final static String DNET_SCHEMA_ID = "dnet:provenanceActions";
public final static String CLASS_ID_SUBJECT = "bulktagging:community:subject";
public final static String CLASS_ID_DATASOURCE = "bulktagging:community:datasource";
public final static String CLASS_ID_CZENODO = "bulktagging:community:zenodocommunity";
public final static String SCHEMA_ID = "dnet:provenanceActions";
public final static String COUNTER_GROUP = "Bulk Tagging";
public final static String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
public final static String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
public final static String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource";
public final static String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp;
import com.google.gson.Gson;
import org.dom4j.Node;
/**
* Created by miriam on 01/08/2018.
*/
public class ZenodoCommunity {
private String zenodoCommunityId;
private SelectionConstraints selCriteria;
public String getZenodoCommunityId() {
return zenodoCommunityId;
}
public void setZenodoCommunityId(String zenodoCommunityId) {
this.zenodoCommunityId = zenodoCommunityId;
}
public SelectionConstraints getSelCriteria() {
return selCriteria;
}
public void setSelCriteria(SelectionConstraints selCriteria) {
this.selCriteria = selCriteria;
}
private void setSelCriteria(String json){
//Type collectionType = new TypeToken<Collection<Constraints>>(){}.getType();
selCriteria = new Gson().fromJson(json, SelectionConstraints.class);
}
public void setSelCriteria(Node n){
if (n==null){
selCriteria = null;
}else{
setSelCriteria(n.getText());
}
}
}

View File

@ -0,0 +1,27 @@
package eu.dnetlib.dhp.selectioncriteria;
@VerbClass("contains")
public class ContainsVerb implements Selection {
private String param;
public ContainsVerb() {
}
public ContainsVerb(final String param) {
this.param = param;
}
@Override
public boolean apply(String value) {
return value.contains(param);
}
public String getParam() {
return param;
}
public void setParam(String param) {
this.param = param;
}
}

View File

@ -0,0 +1,29 @@
package eu.dnetlib.dhp.selectioncriteria;
@VerbClass("equals")
public class EqualVerb implements Selection {
private String param;
public EqualVerb() {
}
public EqualVerb(final String param) {
this.param = param;
}
@Override
public boolean apply(String value) {
return value.equalsIgnoreCase(param);
}
public String getParam() {
return param;
}
public void setParam(String param) {
this.param = param;
}
}

View File

@ -0,0 +1,37 @@
package eu.dnetlib.dhp.selectioncriteria;
import com.google.gson.*;
import java.lang.reflect.Type;
public class InterfaceAdapter implements JsonSerializer, JsonDeserializer {
private static final String CLASSNAME = "CLASSNAME";
private static final String DATA = "DATA";
public Object deserialize(JsonElement jsonElement, Type type,
JsonDeserializationContext jsonDeserializationContext) throws JsonParseException {
JsonObject jsonObject = jsonElement.getAsJsonObject();
JsonPrimitive prim = (JsonPrimitive) jsonObject.get(CLASSNAME);
String className = prim.getAsString();
Class klass = getObjectClass(className);
return jsonDeserializationContext.deserialize(jsonObject.get(DATA), klass);
}
public JsonElement serialize(Object jsonElement, Type type, JsonSerializationContext jsonSerializationContext) {
JsonObject jsonObject = new JsonObject();
jsonObject.addProperty(CLASSNAME, jsonElement.getClass().getName());
jsonObject.add(DATA, jsonSerializationContext.serialize(jsonElement));
return jsonObject;
}
/****** Helper method to get the className of the object to be deserialized *****/
public Class getObjectClass(String className) {
try {
return Class.forName(className);
} catch (ClassNotFoundException e) {
//e.printStackTrace();
throw new JsonParseException(e.getMessage());
}
}
}

View File

@ -0,0 +1,27 @@
package eu.dnetlib.dhp.selectioncriteria;
@VerbClass("not_contains")
public class NotContainsVerb implements Selection {
private String param;
public NotContainsVerb() {
}
public NotContainsVerb(final String param) {
this.param = param;
}
@Override
public boolean apply(String value) {
return !value.contains(param);
}
public String getParam() {
return param;
}
public void setParam(String param) {
this.param = param;
}
}

View File

@ -0,0 +1,29 @@
package eu.dnetlib.dhp.selectioncriteria;
@VerbClass("not_equals")
public class NotEqualVerb implements Selection {
private String param;
public NotEqualVerb(final String param) {
this.param = param;
}
public NotEqualVerb() {
}
public String getParam() {
return param;
}
public void setParam(String param) {
this.param = param;
}
@Override
public boolean apply(String value) {
return !value.equalsIgnoreCase(param);
}
}

View File

@ -0,0 +1,6 @@
package eu.dnetlib.dhp.selectioncriteria;
public interface Selection {
boolean apply(String value);
}

View File

@ -0,0 +1,13 @@
package eu.dnetlib.dhp.selectioncriteria;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface VerbClass {
public String value();
}

View File

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.selectioncriteria;
import org.reflections.Reflections;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
public class VerbResolver implements Serializable {
private final Map<String, Class<Selection>> map;
public VerbResolver(){
this.map = new Reflections("eu.dnetlib").getTypesAnnotatedWith(VerbClass.class).stream()
.collect(Collectors.toMap(v -> v.getAnnotation(VerbClass.class).value(), v->(Class<Selection>)v));
}
public Selection getSelectionCriteria(String name, String param) throws NoSuchMethodException, IllegalAccessException, InvocationTargetException, InstantiationException {
return map.get(name).getDeclaredConstructor((String.class)).newInstance(param);
}
}

View File

@ -0,0 +1,10 @@
package eu.dnetlib.dhp.selectioncriteria;
public class VerbResolverFactory {
public static VerbResolver newInstance(){
return new VerbResolver();
}
}