dnet-hadoop/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/CommunityConfigurationFacto...

128 lines
4.0 KiB
Java
Raw Normal View History

2020-04-30 11:05:17 +02:00
package eu.dnetlib.dhp.bulktag.community;
2020-04-30 11:05:17 +02:00
2020-05-11 17:38:08 +02:00
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
2020-03-03 16:38:50 +01:00
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
2020-05-11 17:38:08 +02:00
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.dhp.bulktag.criteria.InterfaceAdapter;
import eu.dnetlib.dhp.bulktag.criteria.Selection;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolver;
import eu.dnetlib.dhp.bulktag.criteria.VerbResolverFactory;
2020-04-30 11:05:17 +02:00
/** Created by miriam on 03/08/2018. */
2020-03-03 16:38:50 +01:00
public class CommunityConfigurationFactory {
2020-04-30 11:05:17 +02:00
private static final Log log = LogFactory.getLog(CommunityConfigurationFactory.class);
2020-03-03 16:38:50 +01:00
2021-05-14 10:58:12 +02:00
private static final VerbResolver resolver = VerbResolverFactory.newInstance();
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
public static CommunityConfiguration newInstance(final String xml) throws DocumentException {
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
log.debug(String.format("parsing community configuration from:\n%s", xml));
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
final Document doc = new SAXReader().read(new StringReader(xml));
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
final Map<String, Community> communities = Maps.newHashMap();
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
for (final Object o : doc.selectNodes("//community")) {
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
final Node node = (Node) o;
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
final Community community = parseCommunity(node);
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
if (community.isValid()) {
communities.put(community.getId(), community);
}
}
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
log.info(String.format("loaded %s community configuration profiles", communities.size()));
2021-05-14 10:58:12 +02:00
log.debug(String.format("loaded community configuration:\n%s", communities));
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
return new CommunityConfiguration(communities);
}
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
public static CommunityConfiguration fromJson(final String json) {
GsonBuilder builder = new GsonBuilder();
builder.registerTypeAdapter(Selection.class, new InterfaceAdapter());
Gson gson = builder.create();
final CommunityConfiguration conf = gson.fromJson(json, CommunityConfiguration.class);
log.info(String.format("loaded %s community configuration profiles", conf.size()));
conf.init();
log.info("created inverse maps");
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
return conf;
}
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
private static Community parseCommunity(final Node node) {
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
final Community c = new Community();
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
c.setId(node.valueOf("./@id"));
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
log.info(String.format("community id: %s", c.getId()));
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
c.setSubjects(parseSubjects(node));
c.setProviders(parseDatasources(node));
2020-04-30 11:05:17 +02:00
c.setZenodoCommunities(parseZenodoCommunities(node));
return c;
}
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
private static List<String> parseSubjects(final Node node) {
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
final List<String> subjects = Lists.newArrayList();
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
final List<Node> list = node.selectNodes("./subjects/subject");
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
for (Node n : list) {
log.debug("text of the node " + n.getText());
subjects.add(StringUtils.trim(n.getText()));
}
log.info("size of the subject list " + subjects.size());
return subjects;
}
2020-03-03 16:38:50 +01:00
private static List<Provider> parseDatasources(final Node node) {
2020-04-30 11:05:17 +02:00
final List<Node> list = node.selectNodes("./datasources/datasource");
final List<Provider> providerList = new ArrayList<>();
2020-04-30 11:05:17 +02:00
for (Node n : list) {
Provider d = new Provider();
2020-04-30 11:05:17 +02:00
d.setOpenaireId(n.selectSingleNode("./openaireId").getText());
d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver);
providerList.add(d);
2020-04-30 11:05:17 +02:00
}
log.info("size of the datasource list " + providerList.size());
return providerList;
2020-04-30 11:05:17 +02:00
}
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
private static List<ZenodoCommunity> parseZenodoCommunities(final Node node) {
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
final List<Node> list = node.selectNodes("./zenodocommunities/zenodocommunity");
final List<ZenodoCommunity> zenodoCommunityList = new ArrayList<>();
for (Node n : list) {
ZenodoCommunity zc = new ZenodoCommunity();
zc.setZenodoCommunityId(n.selectSingleNode("./zenodoid").getText());
zc.setSelCriteria(n.selectSingleNode("./selcriteria"));
2020-03-03 16:38:50 +01:00
2020-04-30 11:05:17 +02:00
zenodoCommunityList.add(zc);
}
2020-08-13 18:44:07 +02:00
2020-04-30 11:05:17 +02:00
log.info("size of the zenodo community list " + zenodoCommunityList.size());
return zenodoCommunityList;
}
}