diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
index f2ae0ec76..3e520b980 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
@@ -43,10 +43,9 @@ public class GraphCleaningFunctions extends CleaningFunctions {
 	private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
 
 	private static final String[] normalizeDateFormats = {
-		"yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy"
+		"yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy"
 	};
 
-
 	public static <T extends Oaf> T fixVocabularyNames(T value) {
 		if (value instanceof Datasource) {
 			// nothing to clean here
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java
index 7fc113b6c..cfc3b0c2b 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java
@@ -5,9 +5,9 @@ import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.Date;
 
-import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
 import org.apache.commons.lang3.StringUtils;
 
+import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
 import net.sf.saxon.expr.XPathContext;
 import net.sf.saxon.om.Sequence;
 import net.sf.saxon.trans.XPathException;
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
index 36689a522..0033978bf 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplication.java
@@ -59,9 +59,6 @@ public class SolrAdminApplication implements Closeable {
 		final String zkHost = isLookup.getZkHost();
 		log.info("zkHost: {}", zkHost);
 
-
-
-
 		final String collection = ProvisionConstants.getCollectionName(format);
 		log.info("collection: {}", collection);
 
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrUtil.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrUtil.java
index 0b6b5844c..80d0fcd68 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrUtil.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/SolrUtil.java
@@ -1,8 +1,24 @@
+
 package eu.dnetlib.dhp.oa.provision;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.type.MapType;
-import com.fasterxml.jackson.databind.type.TypeFactory;
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.request.QueryRequest;
@@ -21,217 +37,211 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.stringtemplate.v4.ST;
 
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerFactory;
-import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.net.HttpURLConnection;
-import java.net.URL;
-import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Objects;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.type.MapType;
+import com.fasterxml.jackson.databind.type.TypeFactory;
 
 public class SolrUtil {
 
-	/**
-	 * The log.
-	 */
-	private static final Logger log = LoggerFactory.getLogger(SolrUtil.class);
+	/**
+	 * The log.
+	 */
+	private static final Logger log = LoggerFactory.getLogger(SolrUtil.class);
 
-	/**
-	 * The Constant CONFIGS_PATH.
-	 */
-	private static final String CONFIGS_PATH = "/configs";
+	/**
+	 * The Constant CONFIGS_PATH.
+	 */
+	private static final String CONFIGS_PATH = "/configs";
 
-	private static final char DELIMITER = '$';
+	private static final char DELIMITER = '$';
 
-	private static final String CONF_BASE_PATH ="/eu/dnetlib/dhp/oa/provision/conf";
+	private static final String CONF_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf";
 
-	public static final String CONF_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/";
+	public static final String CONF_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/";
 
-	private static final String SCHEMA_TEMPLATE_PATH= "/eu/dnetlib/dhp/oa/provision/conf/schemaTemplate.xslt";
+	private static final String SCHEMA_TEMPLATE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/schemaTemplate.xslt";
 
+	private static String createURLRequest = "http://%s:%s/solr/admin/collections?action=CREATE&name=%s&numShards=%s&replicationFactor=%s&maxShardsPerNode=%s&collection.configName=%s";
 
-	private static String createURLRequest = "http://%s:%s/solr/admin/collections?action=CREATE&name=%s&numShards=%s&replicationFactor=%s&maxShardsPerNode=%s&collection.configName=%s";
 
+	private static String generateCreateIndexRequest(final String host,
+		final String port,
+		final String collectionName,
+		final String numShard,
+		final String replicationFactor,
+		final String collectionConfigName,
+		final String maxShardsPerNode) {
+		return String
+			.format(
+				createURLRequest, host, port, collectionName, numShard, replicationFactor, maxShardsPerNode,
+				collectionConfigName);
+	}
 
-	private static String generateCreateIndexRequest(final String host,
-		final String port,
-		final String collectionName,
-		final String numShard,
-		final String replicationFactor,
-		final String collectionConfigName,
-		final String maxShardsPerNode) {
-		return String.format(createURLRequest, host, port, collectionName, numShard, replicationFactor, maxShardsPerNode, collectionConfigName);
-	}
 
+	public static boolean createSolrIndex(final String host,
+		final String port,
+		final String collectionName,
+		final String numShard,
+		final String replicationFactor,
+		final String maxShardsPerNode,
+		final String collectionConfigName) throws Exception {
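+		// Hits the Solr Collections Admin API (action=CREATE) with a plain HTTP
+		// GET; the response body is consumed only for logging, and the method
+		// returns true unless an exception is thrown.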
 
-	public static boolean createSolrIndex(final String host,
-		final String port,
-		final String collectionName,
-		final String numShard,
-		final String replicationFactor,
-		final String maxShardsPerNode,
-		final String collectionConfigName) throws Exception {
+		final String uri = generateCreateIndexRequest(
+			host, port, collectionName, numShard, replicationFactor, maxShardsPerNode, collectionConfigName);
 
-		final String uri = generateCreateIndexRequest(host, port, collectionName, numShard, replicationFactor, maxShardsPerNode, collectionConfigName);
+		URL url = new URL(uri);
+		System.out.println(uri);
 
-		URL url = new URL(uri);
-		System.out.println(uri);
+		HttpURLConnection connection = (HttpURLConnection) url.openConnection();
+		connection.setRequestMethod("GET");
+		int status = connection.getResponseCode();
+		System.out.println("status = " + status);
 
-		HttpURLConnection connection = (HttpURLConnection) url.openConnection();
-		connection.setRequestMethod("GET");
-		int status = connection.getResponseCode();
-		System.out.println("status = " + status);
+		BufferedReader in = new BufferedReader(
+			new InputStreamReader(connection.getInputStream()));
+		String inputLine;
+		StringBuffer content = new StringBuffer();
+		while ((inputLine = in.readLine()) != null) {
+			content.append(inputLine);
+		}
+		in.close();
 
-		BufferedReader in = new BufferedReader(
-			new InputStreamReader(connection.getInputStream()));
-		String inputLine;
-		StringBuffer content = new StringBuffer();
-		while ((inputLine = in.readLine()) != null) {
-			content.append(inputLine);
-		}
-		in.close();
+		log.debug("content = " + content);
+		return true;
+	}
 
-		log.debug("content = " + content);
+	public static void uploadZookeperConfig(final SolrZkClient zkClient,
+		final String coreName,
+		final boolean overwrite,
+		final String layout) {
 
-		return true;
-	}
+		final String basepath = CONFIGS_PATH + "/" + coreName;
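+		// Solr configuration files live under /configs/<coreName> in ZooKeeper;
+		// with overwrite=true any previously uploaded configuration is removed first.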
 
-	public static void uploadZookeperConfig(final SolrZkClient zkClient,
-		final String coreName,
-		final boolean overwrite,
-		final String layout){
+		log.info("uploading solr configuration to ZK for index collection: " + coreName);
+		try {
+			if (overwrite && zkClient.getSolrZooKeeper().exists(basepath, false) != null) {
+				log.info("cleanup ZK configuration: " + coreName);
+				for (String child : zkClient.getSolrZooKeeper().getChildren(basepath, false)) {
+					final String path = basepath + "/" + child;
+					log.debug("cleanup ZK file: " + path);
+					zkClient.delete(path, -1, true);
+				}
+				zkClient.delete(basepath, -1, true);
+			}
+			if (!zkClient.exists(basepath, true)) {
+				log.info("upload ZK configuration: " + coreName);
+				zkClient.makePath(basepath, true);
+				uploadConfiguration(zkClient, basepath, buildConfiguration(layout));
+			}
+			log.info("upload ZK configuration complete");
+		} catch (Exception e) {
+			throw new RuntimeException("unable to upload solr configuration", e);
+		}
+	}
 
-		final String basepath = CONFIGS_PATH + "/" + coreName;
+	private static void uploadConfiguration(final SolrZkClient zkClient, final String basePath,
+		final Map<String, byte[]> resources) throws KeeperException,
+		InterruptedException, IOException {
 
-		log.info("uploading solr configuration to ZK for index collection: " + coreName);
-		try {
-			if (overwrite && zkClient.getSolrZooKeeper().exists(basepath, false) != null) {
-				log.info("cleanup ZK configuration: " + coreName);
-				for (String child : zkClient.getSolrZooKeeper().getChildren(basepath, false)) {
-					final String path = basepath + "/" + child;
-					log.debug("cleanup ZK file: " + path);
-					zkClient.delete(path, -1, true);
-				}
-				zkClient.delete(basepath, -1, true);
-			}
-			if (!zkClient.exists(basepath, true)) {
-				log.info("upload ZK configuration: " + coreName);
-				zkClient.makePath(basepath, true);
-				uploadConfiguration(zkClient, basepath, buildConfiguration(layout));
-			}
-			log.info("upload ZK configuration complete");
-		} catch (Exception e) {
-			throw new RuntimeException("unable to upload solr configuration", e);
-		}
-	}
+		if (!zkClient.exists(basePath, true)) {
+			zkClient.makePath(basePath, true);
+		}
 
-	private static void uploadConfiguration(final SolrZkClient zkClient, final String basePath, final Map<String, byte[]> resources) throws KeeperException,
-		InterruptedException, IOException {
+		for (final Map.Entry<String, byte[]> e : resources.entrySet()) {
+			String path = basePath + "/" + e.getKey();
+			log.debug("upload ZK configuration: " + path);
+			zkClient.create(path, e.getValue(), CreateMode.PERSISTENT, true);
+		}
+	}
 
-		if (!zkClient.exists(basePath, true)) {
-			zkClient.makePath(basePath, true);
-		}
+	private static String loadFileInClassPath(final String aPath) {
+		try {
+			return IOUtils
+				.toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(aPath)), Charset.defaultCharset());
+		} catch (IOException e) {
+			return null;
+		}
+	}
 
-		for (final Map.Entry<String, byte[]> e : resources.entrySet()) {
-			String path = basePath + "/" + e.getKey();
-			log.debug("upload ZK configuration: " + path);
-			zkClient.create(path, e.getValue(), CreateMode.PERSISTENT, true);
-		}
-	}
+	public static Map<String, String> getServiceProperties() throws IOException {
+		final String properties = loadFileInClassPath(CONF_BASE_PATH + "/service_properties.json");
+		final ObjectMapper mapper = new ObjectMapper();
+		TypeFactory typeFactory = mapper.getTypeFactory();
+		MapType mapType = typeFactory.constructMapType(HashMap.class, String.class, String.class);
+		return mapper.readValue(properties, mapType);
+	}
+	public static String getConfig() throws Exception {
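+		// solrconfig.xml.st is a StringTemplate ('$'-delimited placeholders, see
+		// DELIMITER) rendered with the values read from service_properties.json.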
+		final Map<String, String> p = getServiceProperties();
+		final String st = loadFileInClassPath(CONF_BASE_PATH + "/solrconfig.xml.st");
+		final ST solrConfig = new ST(st, DELIMITER, DELIMITER);
+		p.forEach(solrConfig::add);
+		return solrConfig.render();
+	}
 
-	private static String loadFileInClassPath(final String aPath) {
-		try {
-			return IOUtils.toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(aPath)), Charset.defaultCharset());
-		} catch (IOException e) {
-			return null;
-		}
-	}
+	public static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
+		int replicationFactor, int maxShardsPerNode, String configName) throws Exception {
+		ModifiableSolrParams modParams = new ModifiableSolrParams();
+		modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
+		modParams.set("name", name);
+		modParams.set("numShards", numShards);
+		modParams.set("replicationFactor", replicationFactor);
+		modParams.set("collection.configName", configName);
+		modParams.set("maxShardsPerNode", maxShardsPerNode);
+		QueryRequest request = new QueryRequest(modParams);
+		request.setPath("/admin/collections");
+		return client.request(request);
+	}
 
-	public static Map<String, String> getServiceProperties() throws IOException {
-		final String properties = loadFileInClassPath(CONF_BASE_PATH+"/service_properties.json");
-		final ObjectMapper mapper = new ObjectMapper();
-		TypeFactory typeFactory = mapper.getTypeFactory();
-		MapType mapType = typeFactory.constructMapType(HashMap.class, String.class, String.class);
-		return mapper.readValue(properties, mapType);
-	}
+	private static Map<String, byte[]> buildConfiguration(final String layout)
+		throws Exception {
+		Map<String, byte[]> res = new HashMap<>();
 
-	public static String getConfig() throws Exception {
-		final Map<String, String> p = getServiceProperties();
-		final String st = loadFileInClassPath(CONF_BASE_PATH+"/solrconfig.xml.st");
-		final ST solrConfig = new ST(st, DELIMITER, DELIMITER);
-		p.forEach(solrConfig::add);
-		return solrConfig.render();
-	}
+		try {
+			log.debug("adding schema.xml to the resource map");
+			res.put("schema.xml", getSchemaXML(layout).getBytes());
 
-	public static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
-		int replicationFactor, int maxShardsPerNode, String configName) throws Exception {
-		ModifiableSolrParams modParams = new ModifiableSolrParams();
-		modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
-		modParams.set("name", name);
-		modParams.set("numShards", numShards);
-		modParams.set("replicationFactor", replicationFactor);
-		modParams.set("collection.configName", configName);
-		modParams.set("maxShardsPerNode", maxShardsPerNode);
-		QueryRequest request = new QueryRequest(modParams);
-		request.setPath("/admin/collections");
-		return client.request(request);
-	}
+			res.put("solrconfig.xml", getConfig().getBytes());
+			log.debug("adding solrconfig.xml to the resource map");
 
-	private static Map<String, byte[]> buildConfiguration(final String layout)
-		throws Exception {
+			Files
+				.list(
+					Paths.get(Objects.requireNonNull(SolrUtil.class.getResource(CONF_FILE_BASE_PATH)).getPath()))
+				.map(Path::getFileName)
+				.forEach(s -> {
+					log.debug(String.format("put file from path %s", CONF_FILE_BASE_PATH + s));
+					res
+						.put(
+							String.valueOf(s),
+							Objects
+								.requireNonNull(loadFileInClassPath(CONF_FILE_BASE_PATH + s))
+								.getBytes(StandardCharsets.UTF_8));
+				});
 
-		Map<String, byte[]> res = new HashMap<>();
+			return res;
+		} catch (Throwable e) {
+			throw new Exception("failed to build configuration", e);
+		}
+	}
 
-		try {
-			log.debug("adding schema.xml to the resource map");
-			res.put("schema.xml", getSchemaXML(layout).getBytes());
+	public static String getSchemaXML(final String layout) throws Exception {
 
-			res.put("solrconfig.xml", getConfig().getBytes());
-			log.debug("adding solrconfig.xml to the resource map");
+		final Document fields = new SAXReader().read(new ByteArrayInputStream(layout.getBytes(StandardCharsets.UTF_8)));
 
-			Files.list(
-				Paths.get(Objects.requireNonNull(SolrUtil.class.getResource(CONF_FILE_BASE_PATH)).getPath()))
-				.map(Path::getFileName)
-				.forEach(s-> {
-					log.debug(String.format("put file from path %s",CONF_FILE_BASE_PATH + s));
-					res.put(String.valueOf(s),
+		Transformer transformer = TransformerFactory
+			.newInstance()
+			.newTransformer(
+				new DocumentSource(new SAXReader().read(SolrUtil.class.getResourceAsStream(SCHEMA_TEMPLATE_PATH))));
+		transformer.setParameter("textFieldType", "text_common");
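+		// The metadata-format layout is transformed into the Solr schema.xml by
+		// the schemaTemplate.xslt stylesheet; the textFieldType parameter is
+		// passed as "text_common".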
 
-						Objects.requireNonNull(loadFileInClassPath(CONF_FILE_BASE_PATH + s)).getBytes(StandardCharsets.UTF_8));}
-			);
+		final DocumentResult result = new DocumentResult();
 
-			return res;
-		} catch (Throwable e) {
-			throw new Exception("failed to build configuration", e);
-		}
-	}
+		transformer.transform(new DocumentSource(fields), result);
+		String xml = result.getDocument().asXML();
+		log.debug("new index schema:\n" + xml);
 
-	public static String getSchemaXML(final String layout) throws Exception {
-
-		final Document fields = new SAXReader().read(new ByteArrayInputStream(layout.getBytes(StandardCharsets.UTF_8)));
-
-
-		Transformer transformer = TransformerFactory.newInstance().newTransformer(new DocumentSource(new SAXReader().read(SolrUtil.class.getResourceAsStream(SCHEMA_TEMPLATE_PATH))));
-		transformer.setParameter("textFieldType", "text_common");
-
-		final DocumentResult result = new DocumentResult();
-
-		transformer.transform(new DocumentSource(fields), result);
-		String xml = result.getDocument().asXML();
-
-		log.debug("new index schema:\n" + xml);
-
-		return xml;
-	}
+		return xml;
+	}
 }
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/scholix/ScholixToSolr.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/scholix/ScholixToSolr.java
index 1c536d30b..7708242e3 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/scholix/ScholixToSolr.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/scholix/ScholixToSolr.java
@@ -1,10 +1,5 @@
-package eu.dnetlib.dhp.oa.provision.scholix;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
-import eu.dnetlib.dhp.schema.sx.scholix.*;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrInputDocument;
+package eu.dnetlib.dhp.oa.provision.scholix;
 
 import java.io.IOException;
 import java.time.LocalDate;
@@ -12,95 +7,111 @@ import java.util.List;
 import java.util.Objects;
 import java.util.stream.Collectors;
 
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrInputDocument;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
+import eu.dnetlib.dhp.schema.sx.scholix.*;
+
 public class ScholixToSolr {
 
-	final static ObjectMapper MAPPER = new ObjectMapper();
+	final static ObjectMapper MAPPER = new ObjectMapper();
 
+	public static SolrInputDocument toSolrDocument(final String json) {
+		try {
+			final Scholix input = MAPPER.readValue(json, Scholix.class);
+			final SolrInputDocument output = new SolrInputDocument();
 
-	public static SolrInputDocument toSolrDocument(final String json) {
-		try {
-			final Scholix input = MAPPER.readValue(json, Scholix.class);
-			final SolrInputDocument output = new SolrInputDocument();
+			fillEntityField(output, input.getSource(), "source");
+			fillEntityField(output, input.getTarget(), "target");
+			final String cleanDate = GraphCleaningFunctions.cleanDate(input.getPublicationDate());
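+			// cleanDate() normalises the heterogeneous date formats found in
+			// Scholix records and returns null when the date cannot be parsed,
+			// in which case no publication_date field is indexed.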
 
-			fillEntityField(output,input.getSource(), "source");
-			fillEntityField(output,input.getTarget(), "target");
-			final String cleanDate= GraphCleaningFunctions.cleanDate(input.getPublicationDate());
+			if (cleanDate != null)
+				output.addField("publication_date", GraphCleaningFunctions.normalizeDate(cleanDate));
 
-			if(cleanDate!= null)
-				output.addField("publication_date", GraphCleaningFunctions.normalizeDate(cleanDate));
+			if (input.getRelationship() != null && input.getRelationship().getName() != null)
+				output.addField("relation_name", input.getRelationship().getName());
+			else
+				return null;
+			if (input.getRelationship() != null && input.getRelationship().getInverse() != null)
+				output.addField("relation_inverse", input.getRelationship().getInverse());
 
-			if (input.getRelationship()!= null && input.getRelationship().getName()!= null)
-				output.addField("relation_name", input.getRelationship().getName());
-			else
-				return null;
-			if (input.getRelationship()!= null && input.getRelationship().getInverse()!= null)
-				output.addField("relation_inverse", input.getRelationship().getInverse());
+			if (input.getLinkprovider() != null) {
+				final List<String> linkProviders = input
+					.getLinkprovider()
+					.stream()
+					.map(ScholixEntityId::getName)
+					.filter(Objects::nonNull)
+					.collect(Collectors.toList());
 
-			if (input.getLinkprovider()!= null) {
-				final List<String> linkProviders = input.getLinkprovider().stream()
-					.map(ScholixEntityId::getName)
-					.filter(Objects::nonNull)
-					.collect(Collectors.toList());
+				output.addField("link_provider", linkProviders);
+			}
+			if (input.getPublisher() != null) {
+				final List<String> publishers = input
+					.getPublisher()
+					.stream()
+					.map(ScholixEntityId::getName)
+					.filter(Objects::nonNull)
+					.collect(Collectors.toList());
+				output.addField("publisher_name", publishers);
+			}
 
-				output.addField("link_provider",linkProviders);
-			}
-			if(input.getPublisher()!= null) {
-				final List<String> publishers = input.getPublisher().stream()
-					.map(ScholixEntityId::getName)
-					.filter(Objects::nonNull)
-					.collect(Collectors.toList());
-				output.addField("publisher_name", publishers);
-			}
+			output.addField("__indexrecordidentifier", input.getIdentifier());
+			output.addField("__result", json);
+			return output;
 
-			output.addField("__indexrecordidentifier", input.getIdentifier());
-			output.addField("__result", json);
-			return output;
+		} catch (IOException e) {
+			throw new RuntimeException("Error on convert Scholix");
+		}
+	}
 
-		} catch (IOException e) {
-			throw new RuntimeException("Error on convert Scholix");
-		}
-	}
+	private static void fillEntityField(final SolrInputDocument document, final ScholixResource resource,
+		final String prefix) {
+		document.addField(prefix + "_identifier", resource.getDnetIdentifier());
+		document.addField(prefix + "_type", resource.getObjectType());
+		document.addField(prefix + "_publication_date", resource.getPublicationDate());
+		document.addField(prefix + "_subtype", resource.getObjectSubType());
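+		// Each Scholix source/target is flattened into prefixed Solr fields
+		// (e.g. source_pid, target_type), keeping both ends of the relation queryable.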
 
-	private static void fillEntityField(final SolrInputDocument document, final ScholixResource resource, final String prefix) {
+		List<String> resourcePIDs = resource
+			.getIdentifier()
+			.stream()
+			.map(ScholixIdentifier::getIdentifier)
+			.collect(Collectors.toList());
+		document.addField(prefix + "_pid", resourcePIDs);
 
-		document.addField(prefix+"_identifier",resource.getDnetIdentifier());
-		document.addField(prefix+"_type", resource.getObjectType());
-		document.addField(prefix+"_publication_date", resource.getPublicationDate());
-		document.addField(prefix+"_subtype", resource.getObjectSubType());
+		List<String> resourceSchemas = resource
+			.getIdentifier()
+			.stream()
+			.map(ScholixIdentifier::getSchema)
+			.collect(Collectors.toList());
+		document.addField(prefix + "_schema", resourceSchemas);
+		if (resource.getPublisher() != null) {
 
-		List<String> resourcePIDs = resource.getIdentifier().stream()
-			.map(ScholixIdentifier::getIdentifier)
-			.collect(Collectors.toList());
-		document.addField(prefix+"_pid", resourcePIDs);
+			final List<String> publishers = resource
+				.getPublisher()
+				.stream()
+				.map(ScholixEntityId::getName)
+				.collect(Collectors.toList());
+			if (publishers.size() > 0)
+				document.addField(prefix + "_publisher", publishers);
+		}
 
-		List<String> resourceSchemas = resource.getIdentifier().stream()
-			.map(ScholixIdentifier::getSchema)
-			.collect(Collectors.toList());
-		document.addField(prefix+"_schema", resourceSchemas);
-
-
-		if (resource.getPublisher() != null) {
-
-			final List<String> publishers = resource.getPublisher().stream().map(ScholixEntityId::getName).collect(Collectors.toList());
-			if (publishers.size()>0)
-				document.addField(prefix+"_publisher", publishers);
-		}
-
-
-		if (resource.getCollectedFrom() != null) {
-
-			final List<String> collectedFrom = resource.getCollectedFrom().stream()
-				.map(ScholixCollectedFrom::getProvider)
-				.filter(Objects::nonNull)
-				.map(ScholixEntityId::getName)
-				.collect(Collectors.toList());
-			if (collectedFrom.size()>0)
-				document.addField(prefix+"_collected_from", collectedFrom);
-		}
-
-	}
+		if (resource.getCollectedFrom() != null) {
+			final List<String> collectedFrom = resource
+				.getCollectedFrom()
+				.stream()
+				.map(ScholixCollectedFrom::getProvider)
+				.filter(Objects::nonNull)
+				.map(ScholixEntityId::getName)
+				.collect(Collectors.toList());
+			if (collectedFrom.size() > 0)
+				document.addField(prefix + "_collected_from", collectedFrom);
+		}
+	}
 }
diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/ScholixIndexingTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/ScholixIndexingTest.java
index cbc857f42..45a3642f3 100644
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/ScholixIndexingTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/ScholixIndexingTest.java
@@ -1,6 +1,17 @@
+
 package eu.dnetlib.dhp.oa.provision;
 
-import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr;
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.zip.GZIPInputStream;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.solr.client.solrj.SolrServerException;
@@ -17,176 +28,157 @@ import org.dom4j.Node;
 import org.dom4j.io.SAXReader;
 import org.junit.jupiter.api.*;
 
-import java.io.*;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.*;
-import java.util.stream.Collectors;
-import java.util.zip.GZIPInputStream;
-
-import static org.junit.jupiter.api.Assertions.*;
+import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr;
 
 @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
-public class ScholixIndexingTest extends SolrTest{
+public class ScholixIndexingTest extends SolrTest {
 
-	private static String LAYOUT_PATH="/eu/dnetlib/dhp/oa/provision/SMF_layout.xml";
+	private static String LAYOUT_PATH = "/eu/dnetlib/dhp/oa/provision/SMF_layout.xml";
 
+	/**
+	 * This test verifies that the schema is generated correctly
+	 * by getting the profile of the metadataFormat and generating the solr schema.xml;
+	 * we expect all the fields in the metadataFormat to be present in the solr schema.
+	 * @throws Exception
+	 */
+	@Test
+	@Order(1)
+	void testSchemaCreation() throws Exception {
 
-	/**
-	 * This test verifies that the schema will be generated correctly
-	 * by get the profile of the metadataFormat and generating solr schema.xml
-	 * we expect that the fiedl in the metadataFormat are all in the field solr schema
-	 * @throws Exception
-	 */
-	@Test
-	@Order(1)
-	void testSchemaCreation() throws Exception {
+		final String layout = loadSMFLayout();
+		assertNotNull(layout);
+		assertTrue(StringUtils.isNotBlank(layout));
 
-		final String layout = loadSMFLayout();
-		assertNotNull(layout);
-		assertTrue(StringUtils.isNotBlank(layout));
+		final String scheme = SolrUtil.getSchemaXML(loadSMFLayout());
+		assertNotNull(scheme);
+		assertTrue(StringUtils.isNotBlank(scheme));
 
-		final String scheme = SolrUtil.getSchemaXML(loadSMFLayout());
-		assertNotNull(scheme);
-		assertTrue(StringUtils.isNotBlank(scheme));
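+		// Compare the field names declared in the layout (//FIELD) with the
+		// ones present in the generated schema (//field).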
+		final Document fields = parseDocument(layout);
+		List<Node> params = fields.selectNodes("//FIELD");
+		final List<String> exptectedFieldName = new ArrayList<>();
+		for (Node param : params) {
+			Element element = (Element) param;
+			String name = element.attributeValue("name");
+			exptectedFieldName.add(name.toLowerCase());
+		}
+		assertTrue(exptectedFieldName.size() > 0);
+		final Document parsedScheme = parseDocument(scheme);
+		params = parsedScheme.selectNodes("//field");
+		final List<String> createdFieldName = new ArrayList<>();
+		for (Node param : params) {
 
-		final Document fields = parseDocument(layout);
-		List<Node> params = fields.selectNodes("//FIELD");
-		final List<String> exptectedFieldName = new ArrayList<>();
-		for (Node param : params) {
-			Element element = (Element) param;
-			String name = element.attributeValue("name");
-			exptectedFieldName.add(name.toLowerCase());
-		}
-		assertTrue(exptectedFieldName.size()>0);
+			Element element = (Element) param;
+			String name = element.attributeValue("name");
+			createdFieldName.add(name.toLowerCase());
+		}
+		assertTrue(createdFieldName.size() > 0);
+		exptectedFieldName.stream().map(createdFieldName::contains).forEach(Assertions::assertTrue);
+	}
 
-		final Document parsedScheme = parseDocument(scheme);
-		params = parsedScheme.selectNodes("//field");
-		final List<String> createdFieldName = new ArrayList<>();
-		for (Node param : params) {
+	/***
+	 * Tests that the creation of the index works:
+	 * we check that all the files are uploaded into
+	 * the zookeeper instance of SOLR under its
+	 * collection name
+	 * @throws Exception
+	 */
+	@Test
+	@Order(2)
+	public void testCreateCollection() throws Exception {
+		final String collectionName = "SMF-index-scholix";
+		SolrUtil.uploadZookeperConfig(miniCluster.getZkClient(), collectionName, true, loadSMFLayout());
 
-			Element element = (Element) param;
-			String name = element.attributeValue("name");
-			createdFieldName.add(name.toLowerCase());
-		}
-		assertTrue(createdFieldName.size()>0);
+		assertTrue(miniCluster.getZkClient().exists("/configs/" + collectionName, true));
+		List<String> items = miniCluster.getZkClient().getChildren("/configs/" + collectionName, null, true);
 
-		exptectedFieldName.stream().map(createdFieldName::contains).forEach(Assertions::assertTrue);
-	}
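+		// The files uploaded to ZooKeeper must match the bundled conf files
+		// plus the generated schema.xml and solrconfig.xml.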
+		List<String> configurationFiles = Files
+			.list(
+				Paths
+					.get(
+						Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_FILE_BASE_PATH)).getPath()))
+			.map(Path::getFileName)
+			.map(Path::toString)
+			.collect(Collectors.toList());
+		configurationFiles.add("schema.xml");
+		configurationFiles.add("solrconfig.xml");
+		configurationFiles.forEach(s -> assertTrue(items.contains(s)));
 
-	/***
-	 * Test the creation of the index works
-	 * we test if all the files are uploaded into
-	 * the zookeeper instance of SOLR under it's
-	 * collection name
-	 * @throws Exception
-	 */
-	@Test
-	@Order(2)
-	public void testCreateCollection() throws Exception {
-		final String collectionName ="SMF-index-scholix";
-		SolrUtil.uploadZookeperConfig(miniCluster.getZkClient(),collectionName,true, loadSMFLayout() );
+		SolrUtil.createCollection(miniCluster.getSolrClient(), "Scholix", 4, 1, 2, collectionName);
 
-		assertTrue(miniCluster.getZkClient().exists("/configs/"+collectionName, true));
-		List<String> items = miniCluster.getZkClient().getChildren("/configs/"+collectionName, null, true);
+		log.debug("Collection Created");
+		final Map<String, String> queryParamMap = new HashMap<>();
+		queryParamMap.put("q", "*:*");
 
-		List<String> configurationFiles =
-			Files.list(
-				Paths.get(
-					Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_FILE_BASE_PATH)).getPath()))
-				.map(Path::getFileName)
-				.map(Path::toString)
-				.collect(Collectors.toList());
-		configurationFiles.add("schema.xml");
-		configurationFiles.add("solrconfig.xml");
-		configurationFiles.forEach(s->assertTrue(items.contains(s)));
+		MapSolrParams queryParams = new MapSolrParams(queryParamMap);
+		final QueryResponse response = miniCluster.getSolrClient().query("Scholix", queryParams);
+		final SolrDocumentList documents = response.getResults();
+		assertEquals(0, documents.getNumFound());
 
-		SolrUtil.createCollection(miniCluster.getSolrClient(), "Scholix", 4,1,2,collectionName);
+	}
 
-		log.debug("Collection Created");
-		final Map<String, String> queryParamMap = new HashMap<>();
-		queryParamMap.put("q", "*:*");
+	@Test
+	@Order(3)
+	public void testFeedingSolrDocument() throws Exception {
 
-		MapSolrParams queryParams = new MapSolrParams(queryParamMap);
-		final QueryResponse response =miniCluster.getSolrClient().query("Scholix", queryParams);
-		final SolrDocumentList documents = response.getResults();
-		assertEquals(0, documents.getNumFound());
+		InputStream gzipStream = new GZIPInputStream(
+			Objects.requireNonNull(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/provision/scholix_records.gz")));
+		Reader decoder = new InputStreamReader(gzipStream, StandardCharsets.UTF_8);
+		BufferedReader buffered = new BufferedReader(decoder);
+		String line = buffered.readLine();
 
-	}
+		final CloudSolrClient client = miniCluster.getSolrClient();
+		client.setDefaultCollection("Scholix");
+		int added = 0;
+		while (line != null) {
+			final SolrInputDocument solrDocument = ScholixToSolr.toSolrDocument(line);
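+			// toSolrDocument() returns null for records without a relation
+			// name; the test fixture is assumed to contain only complete records.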
 
-	@Test
-	@Order(3)
-	public void testFeedingSolrDocument() throws Exception {
+			client.add(solrDocument);
+			added++;
+			line = buffered.readLine();
+		}
+		client.commit();
 
-		InputStream gzipStream = new GZIPInputStream(Objects.requireNonNull(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/provision/scholix_records.gz")));
-		Reader decoder = new InputStreamReader(gzipStream, StandardCharsets.UTF_8);
-		BufferedReader buffered = new BufferedReader(decoder);
-		String line = buffered.readLine();
+		log.debug(String.format("Feed %d documents", added));
 
-		final CloudSolrClient client = miniCluster.getSolrClient();
-		client.setDefaultCollection("Scholix");
-		int added = 0;
-		while (line!= null) {
+		final SolrDocumentList documents = executeQuery("*:*");
+		assertEquals(added, documents.getNumFound());
+		documents.stream().map(s -> s.getFirstValue("source_pid").toString()).forEach(System.out::println);
 
-			final SolrInputDocument solrDocument = ScholixToSolr.toSolrDocument(line);
+		SolrDocumentList source_pids = executeQuery("source_pid:\"10.15468/dl.u47azs\"");
 
-			client.add(solrDocument);
-			added ++;
-			line = buffered.readLine();
-		}
+		System.out.println("source_pid.getNumFound() = " + source_pids.getNumFound());
 
-		client.commit();
+		source_pids.stream().map(s -> s.getFieldValue("source_pid")).forEach(System.out::println);
 
-		log.debug(String.format("Feed %d documents",added));
+	}
+	private SolrDocumentList executeQuery(final String query) throws SolrServerException, IOException {
 
-		final SolrDocumentList documents = executeQuery("*:*");
-		assertEquals(added, documents.getNumFound());
+		final Map<String, String> queryParamMap = new HashMap<>();
+		queryParamMap.put("q", query);
+		MapSolrParams queryParams = new MapSolrParams(queryParamMap);
+		final QueryResponse response = miniCluster.getSolrClient().query("Scholix", queryParams);
+		return response.getResults();
+	}
+	/***
+	 * Utility for parsing XML
+	 * @param xml
+	 * @return Dom4J Document
+	 * @throws DocumentException
+	 */
+	private Document parseDocument(final String xml) throws DocumentException {
+		return new SAXReader().read(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+	}
 
-		documents.stream().map(s-> s.getFirstValue("source_pid").toString()).forEach(System.out::println);
-
-		SolrDocumentList source_pids = executeQuery("source_pid:\"10.15468/dl.u47azs\"");
-
-		System.out.println("source_pid.getNumFound() = " + source_pids.getNumFound());
-
-		source_pids.stream().map(s -> s.getFieldValue("source_pid")).forEach(System.out::println);
-
-	}
-
-
-	private SolrDocumentList executeQuery(final String query) throws SolrServerException, IOException {
-
-		final Map<String, String> queryParamMap = new HashMap<>();
-		queryParamMap.put("q", query);
-
-		MapSolrParams queryParams = new MapSolrParams(queryParamMap);
-		final QueryResponse response =miniCluster.getSolrClient().query("Scholix", queryParams);
-		return response.getResults();
-	}
-
-	/***
-	 * Utility for parsing XML
-	 * @param xml
-	 * @return Dom4J Document
-	 * @throws DocumentException
-	 */
-	private Document parseDocument(final String xml) throws DocumentException {
-		return new SAXReader().read(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
-	}
-
-	private String loadSMFLayout() throws IOException {
-		return IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream(LAYOUT_PATH)));
-	}
-
-
+	private String loadSMFLayout() throws IOException {
+		return IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream(LAYOUT_PATH)));
+	}
 }
diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java
index 994ce2ac1..3e8a35fe1 100644
--- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java
+++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/SolrAdminApplicationTest.java
@@ -1,12 +1,12 @@
 
 package eu.dnetlib.dhp.oa.provision;
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
 import org.apache.solr.client.solrj.response.SolrPingResponse;
 import org.apache.solr.client.solrj.response.UpdateResponse;
 import org.junit.jupiter.api.Test;
 
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
 class SolrAdminApplicationTest extends SolrTest {
 
 	@Test
diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml
index 541d59007..1fae68da6 100644
--- a/dhp-workflows/pom.xml
+++ b/dhp-workflows/pom.xml
@@ -352,7 +352,9 @@
 
+
+
 
@@ -427,9 +429,12 @@
 ssh
+${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}
+-p ${dhp.hadoop.frontend.port.ssh} -o StrictHostKeyChecking=no
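+<!-- assumption: dhp.hadoop.frontend.port.ssh must be defined (e.g. in the active maven profile or settings.xml) for the ssh/scp invocations below -->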
+rm -rf ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; mkdir -p ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/
@@ -443,9 +448,11 @@
 scp
+-P ${dhp.hadoop.frontend.port.ssh} -o StrictHostKeyChecking=no
 target/${oozie.package.file.name}.tar.gz
+${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}:${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/${oozie.package.file.name}.tar.gz
@@ -460,11 +467,15 @@
 ssh
+${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}
+-p ${dhp.hadoop.frontend.port.ssh} -o StrictHostKeyChecking=no
+cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; tar -zxf oozie-package.tar.gz;
+rm ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/oozie-package.tar.gz;
 ./upload_workflow.sh
@@ -495,9 +506,12 @@
 ${oozie.execution.log.file.location}
+${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}
+-p ${dhp.hadoop.frontend.port.ssh} -o StrictHostKeyChecking=no
+cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; ./run_workflow.sh
@@ -512,6 +526,7 @@
 cat
+${oozie.execution.log.file.location}