code refactor

Sandro La Bruzzo, 2022-10-06 09:12:14 +02:00
commit 6d5cda1a03 (parent bf6c8ccc79)
8 changed files with 422 additions and 398 deletions

View File: GraphCleaningFunctions.java

@@ -43,10 +43,9 @@ public class GraphCleaningFunctions extends CleaningFunctions {

    private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";

    private static final String[] normalizeDateFormats = {
        "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy"
    };

    public static <T extends Oaf> T fixVocabularyNames(T value) {
        if (value instanceof Datasource) {
            // nothing to clean here
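For orientation, a minimal sketch (not part of the commit) of how the patterns above can be applied: each entry of normalizeDateFormats is tried in order and the first successful parse is re-serialized with normalizeOutFormat. The class and method names here are hypothetical; note that the output pattern uses hh (12-hour clock), so midnight prints as 12, and the sketch reproduces that as-is.

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class DateNormalizationSketch {

    private static final String OUT_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'";
    private static final String[] IN_FORMATS = {
        "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy"
    };

    // Try each input pattern in order; re-serialize the first successful parse.
    public static String normalize(final String raw) {
        for (final String pattern : IN_FORMATS) {
            try {
                final Date parsed = new SimpleDateFormat(pattern).parse(raw);
                return new SimpleDateFormat(OUT_FORMAT).format(parsed);
            } catch (ParseException e) {
                // fall through to the next pattern
            }
        }
        return null; // no pattern matched
    }

    public static void main(String[] args) {
        System.out.println(normalize("2022/10/06")); // prints 2022-10-06T12:00:00Z
    }
}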

View File

@@ -5,9 +5,9 @@ import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.commons.lang3.StringUtils;

import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;

import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.om.Sequence;
import net.sf.saxon.trans.XPathException;

View File: SolrAdminApplication.java

@@ -59,9 +59,6 @@ public class SolrAdminApplication implements Closeable {
        final String zkHost = isLookup.getZkHost();
        log.info("zkHost: {}", zkHost);

        final String collection = ProvisionConstants.getCollectionName(format);
        log.info("collection: {}", collection);

View File: SolrUtil.java

@@ -1,8 +1,24 @@
package eu.dnetlib.dhp.oa.provision;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;

import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;

import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.QueryRequest;
@@ -21,217 +37,211 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.stringtemplate.v4.ST;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.type.MapType;
import com.fasterxml.jackson.databind.type.TypeFactory;

public class SolrUtil {

    /**
     * The log.
     */
    private static final Logger log = LoggerFactory.getLogger(SolrUtil.class);

    /**
     * The Constant CONFIGS_PATH.
     */
    private static final String CONFIGS_PATH = "/configs";

    private static final char DELIMITER = '$';

    private static final String CONF_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf";

    public static final String CONF_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/";

    private static final String SCHEMA_TEMPLATE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/schemaTemplate.xslt";

    private static String createURLRequest = "http://%s:%s/solr/admin/collections?action=CREATE&name=%s&numShards=%s&replicationFactor=%s&maxShardsPerNode=%s&collection.configName=%s";

    private static String generateCreateIndexRequest(final String host,
        final String port,
        final String collectionName,
        final String numShard,
        final String replicationFactor,
        final String collectionConfigName,
        final String maxShardsPerNode) {
        return String
            .format(
                createURLRequest, host, port, collectionName, numShard, replicationFactor, maxShardsPerNode,
                collectionConfigName);
    }

    public static boolean createSolrIndex(final String host,
        final String port,
        final String collectionName,
        final String numShard,
        final String replicationFactor,
        final String maxShardsPerNode,
        final String collectionConfigName) throws Exception {

        final String uri = generateCreateIndexRequest(
            host, port, collectionName, numShard, replicationFactor, maxShardsPerNode, collectionConfigName);

        URL url = new URL(uri);
        System.out.println(uri);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("GET");
        int status = connection.getResponseCode();
        System.out.println("status = " + status);

        BufferedReader in = new BufferedReader(
            new InputStreamReader(connection.getInputStream()));
        String inputLine;
        StringBuffer content = new StringBuffer();
        while ((inputLine = in.readLine()) != null) {
            content.append(inputLine);
        }
        in.close();

        log.debug("content = " + content);

        return true;
    }

    public static void uploadZookeperConfig(final SolrZkClient zkClient,
        final String coreName,
        final boolean overwrite,
        final String layout) {

        final String basepath = CONFIGS_PATH + "/" + coreName;

        log.info("uploading solr configuration to ZK for index collection: " + coreName);
        try {
            if (overwrite && zkClient.getSolrZooKeeper().exists(basepath, false) != null) {
                log.info("cleanup ZK configuration: " + coreName);
                for (String child : zkClient.getSolrZooKeeper().getChildren(basepath, false)) {
                    final String path = basepath + "/" + child;
                    log.debug("cleanup ZK file: " + path);
                    zkClient.delete(path, -1, true);
                }
                zkClient.delete(basepath, -1, true);
            }
            if (!zkClient.exists(basepath, true)) {
                log.info("upload ZK configuration: " + coreName);
                zkClient.makePath(basepath, true);
                uploadConfiguration(zkClient, basepath, buildConfiguration(layout));
            }
            log.info("upload ZK configuration complete");
        } catch (Exception e) {
            throw new RuntimeException("unable to upload solr configuration", e);
        }
    }

    private static void uploadConfiguration(final SolrZkClient zkClient, final String basePath,
        final Map<String, byte[]> resources) throws KeeperException,
        InterruptedException, IOException {

        if (!zkClient.exists(basePath, true)) {
            zkClient.makePath(basePath, true);
        }

        for (final Map.Entry<String, byte[]> e : resources.entrySet()) {
            String path = basePath + "/" + e.getKey();
            log.debug("upload ZK configuration: " + path);
            zkClient.create(path, e.getValue(), CreateMode.PERSISTENT, true);
        }
    }

    private static String loadFileInClassPath(final String aPath) {
        try {
            return IOUtils
                .toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(aPath)), Charset.defaultCharset());
        } catch (IOException e) {
            return null;
        }
    }

    public static Map<String, String> getServiceProperties() throws IOException {
        final String properties = loadFileInClassPath(CONF_BASE_PATH + "/service_properties.json");
        final ObjectMapper mapper = new ObjectMapper();
        TypeFactory typeFactory = mapper.getTypeFactory();
        MapType mapType = typeFactory.constructMapType(HashMap.class, String.class, String.class);
        return mapper.readValue(properties, mapType);
    }

    public static String getConfig() throws Exception {
        final Map<String, String> p = getServiceProperties();
        final String st = loadFileInClassPath(CONF_BASE_PATH + "/solrconfig.xml.st");
        final ST solrConfig = new ST(st, DELIMITER, DELIMITER);
        p.forEach(solrConfig::add);
        return solrConfig.render();
    }

    public static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
        int replicationFactor, int maxShardsPerNode, String configName) throws Exception {
        ModifiableSolrParams modParams = new ModifiableSolrParams();
        modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
        modParams.set("name", name);
        modParams.set("numShards", numShards);
        modParams.set("replicationFactor", replicationFactor);
        modParams.set("collection.configName", configName);
        modParams.set("maxShardsPerNode", maxShardsPerNode);
        QueryRequest request = new QueryRequest(modParams);
        request.setPath("/admin/collections");
        return client.request(request);
    }

    private static Map<String, byte[]> buildConfiguration(final String layout)
        throws Exception {

        Map<String, byte[]> res = new HashMap<>();

        try {
            log.debug("adding schema.xml to the resource map");
            res.put("schema.xml", getSchemaXML(layout).getBytes());

            res.put("solrconfig.xml", getConfig().getBytes());
            log.debug("adding solrconfig.xml to the resource map");

            Files
                .list(
                    Paths.get(Objects.requireNonNull(SolrUtil.class.getResource(CONF_FILE_BASE_PATH)).getPath()))
                .map(Path::getFileName)
                .forEach(s -> {
                    log.debug(String.format("put file from path %s", CONF_FILE_BASE_PATH + s));
                    res
                        .put(
                            String.valueOf(s),
                            Objects
                                .requireNonNull(loadFileInClassPath(CONF_FILE_BASE_PATH + s))
                                .getBytes(StandardCharsets.UTF_8));
                });

            return res;
        } catch (Throwable e) {
            throw new Exception("failed to build configuration", e);
        }
    }

    public static String getSchemaXML(final String layout) throws Exception {

        final Document fields = new SAXReader().read(new ByteArrayInputStream(layout.getBytes(StandardCharsets.UTF_8)));

        Transformer transformer = TransformerFactory
            .newInstance()
            .newTransformer(
                new DocumentSource(new SAXReader().read(SolrUtil.class.getResourceAsStream(SCHEMA_TEMPLATE_PATH))));
        transformer.setParameter("textFieldType", "text_common");

        final DocumentResult result = new DocumentResult();

        transformer.transform(new DocumentSource(fields), result);
        String xml = result.getDocument().asXML();

        log.debug("new index schema:\n" + xml);

        return xml;
    }
}
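A minimal usage sketch (not part of the commit) chaining the two steps the test class later in this commit exercises: push the generated configuration to ZooKeeper, then create a collection from it. The ZooKeeper address, timeout, layout path and collection/config names are illustrative assumptions, as is the Solr 8.x SolrJ API.

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.Optional;

import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.common.cloud.SolrZkClient;

public class SolrUtilUsageSketch {
    public static void main(String[] args) throws Exception {
        final String zkHost = "localhost:9983"; // assumed: embedded ZK of a local SolrCloud
        final String layout = new String(Files.readAllBytes(Paths.get("SMF_layout.xml")));

        // uploads the generated schema.xml/solrconfig.xml plus the bundled files under /configs/smf-conf
        try (SolrZkClient zkClient = new SolrZkClient(zkHost, 10000)) {
            SolrUtil.uploadZookeperConfig(zkClient, "smf-conf", true, layout);
        }

        // creates a 4-shard collection backed by that configuration
        try (CloudSolrClient client = new CloudSolrClient.Builder(
            Collections.singletonList(zkHost), Optional.empty()).build()) {
            SolrUtil.createCollection(client, "scholix", 4, 1, 2, "smf-conf");
        }
    }
}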

View File: ScholixToSolr.java

@@ -1,10 +1,5 @@
package eu.dnetlib.dhp.oa.provision.scholix;

import java.io.IOException;
import java.time.LocalDate;
@@ -12,95 +7,111 @@ import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.dhp.schema.sx.scholix.*;

public class ScholixToSolr {
    final static ObjectMapper MAPPER = new ObjectMapper();

    public static SolrInputDocument toSolrDocument(final String json) {
        try {
            final Scholix input = MAPPER.readValue(json, Scholix.class);
            final SolrInputDocument output = new SolrInputDocument();

            fillEntityField(output, input.getSource(), "source");
            fillEntityField(output, input.getTarget(), "target");
            final String cleanDate = GraphCleaningFunctions.cleanDate(input.getPublicationDate());

            if (cleanDate != null)
                output.addField("publication_date", GraphCleaningFunctions.normalizeDate(cleanDate));

            if (input.getRelationship() != null && input.getRelationship().getName() != null)
                output.addField("relation_name", input.getRelationship().getName());
            else
                return null;
            if (input.getRelationship() != null && input.getRelationship().getInverse() != null)
                output.addField("relation_inverse", input.getRelationship().getInverse());

            if (input.getLinkprovider() != null) {
                final List<String> linkProviders = input
                    .getLinkprovider()
                    .stream()
                    .map(ScholixEntityId::getName)
                    .filter(Objects::nonNull)
                    .collect(Collectors.toList());

                output.addField("link_provider", linkProviders);
            }
            if (input.getPublisher() != null) {
                final List<String> publishers = input
                    .getPublisher()
                    .stream()
                    .map(ScholixEntityId::getName)
                    .filter(Objects::nonNull)
                    .collect(Collectors.toList());
                output.addField("publisher_name", publishers);
            }

            output.addField("__indexrecordidentifier", input.getIdentifier());
            output.addField("__result", json);
            return output;

        } catch (IOException e) {
            throw new RuntimeException("Error on convert Scholix");
        }
    }

    private static void fillEntityField(final SolrInputDocument document, final ScholixResource resource,
        final String prefix) {

        document.addField(prefix + "_identifier", resource.getDnetIdentifier());
        document.addField(prefix + "_type", resource.getObjectType());
        document.addField(prefix + "_publication_date", resource.getPublicationDate());
        document.addField(prefix + "_subtype", resource.getObjectSubType());

        List<String> resourcePIDs = resource
            .getIdentifier()
            .stream()
            .map(ScholixIdentifier::getIdentifier)
            .collect(Collectors.toList());
        document.addField(prefix + "_pid", resourcePIDs);

        List<String> resourceSchemas = resource
            .getIdentifier()
            .stream()
            .map(ScholixIdentifier::getSchema)
            .collect(Collectors.toList());
        document.addField(prefix + "_schema", resourceSchemas);

        if (resource.getPublisher() != null) {
            final List<String> publishers = resource
                .getPublisher()
                .stream()
                .map(ScholixEntityId::getName)
                .collect(Collectors.toList());
            if (publishers.size() > 0)
                document.addField(prefix + "_publisher", publishers);
        }

        if (resource.getCollectedFrom() != null) {

            final List<String> collectedFrom = resource
                .getCollectedFrom()
                .stream()
                .map(ScholixCollectedFrom::getProvider)
                .filter(Objects::nonNull)
                .map(ScholixEntityId::getName)
                .collect(Collectors.toList());
            if (collectedFrom.size() > 0)
                document.addField(prefix + "_collected_from", collectedFrom);
        }
    }
}
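A minimal usage sketch (not part of the commit) of the converter: one Scholix JSON record is mapped and sent to the index. The collection wiring is assumed to be in place; the class and method names here are hypothetical.

import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.common.SolrInputDocument;

public class ScholixFeedSketch {

    // Converts a single Scholix JSON record and indexes it.
    public static void feedOne(final CloudSolrClient client, final String scholixJson) throws Exception {
        final SolrInputDocument doc = ScholixToSolr.toSolrDocument(scholixJson);
        if (doc != null) { // toSolrDocument returns null when the relationship name is missing
            client.add(doc);
            client.commit();
        }
    }
}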

View File: ScholixIndexingTest.java

@@ -1,6 +1,17 @@
package eu.dnetlib.dhp.oa.provision;

import static org.junit.jupiter.api.Assertions.*;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.client.solrj.SolrServerException;
@@ -17,176 +28,157 @@ import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.*;

import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr;

@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class ScholixIndexingTest extends SolrTest {

    private static String LAYOUT_PATH = "/eu/dnetlib/dhp/oa/provision/SMF_layout.xml";

    /**
     * This test verifies that the schema is generated correctly: we take the profile of the
     * metadataFormat and generate the Solr schema.xml from it, expecting every field of the
     * metadataFormat to appear among the fields of the Solr schema.
     * @throws Exception
     */
    @Test
    @Order(1)
    void testSchemaCreation() throws Exception {

        final String layout = loadSMFLayout();
        assertNotNull(layout);
        assertTrue(StringUtils.isNotBlank(layout));

        final String scheme = SolrUtil.getSchemaXML(loadSMFLayout());
        assertNotNull(scheme);
        assertTrue(StringUtils.isNotBlank(scheme));

        final Document fields = parseDocument(layout);
        List<Node> params = fields.selectNodes("//FIELD");
        final List<String> exptectedFieldName = new ArrayList<>();
        for (Node param : params) {
            Element element = (Element) param;
            String name = element.attributeValue("name");
            exptectedFieldName.add(name.toLowerCase());
        }
        assertTrue(exptectedFieldName.size() > 0);

        final Document parsedScheme = parseDocument(scheme);
        params = parsedScheme.selectNodes("//field");
        final List<String> createdFieldName = new ArrayList<>();
        for (Node param : params) {
            Element element = (Element) param;
            String name = element.attributeValue("name");
            createdFieldName.add(name.toLowerCase());
        }
        assertTrue(createdFieldName.size() > 0);

        exptectedFieldName.stream().map(createdFieldName::contains).forEach(Assertions::assertTrue);
    }

    /***
     * Tests that the creation of the index works: we check that all the files are uploaded
     * into the ZooKeeper instance of Solr under its collection name.
     * @throws Exception
     */
    @Test
    @Order(2)
    public void testCreateCollection() throws Exception {
        final String collectionName = "SMF-index-scholix";
        SolrUtil.uploadZookeperConfig(miniCluster.getZkClient(), collectionName, true, loadSMFLayout());
        assertTrue(miniCluster.getZkClient().exists("/configs/" + collectionName, true));
        List<String> items = miniCluster.getZkClient().getChildren("/configs/" + collectionName, null, true);

        List<String> configurationFiles = Files
            .list(
                Paths
                    .get(
                        Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_FILE_BASE_PATH)).getPath()))
            .map(Path::getFileName)
            .map(Path::toString)
            .collect(Collectors.toList());
        configurationFiles.add("schema.xml");
        configurationFiles.add("solrconfig.xml");
        configurationFiles.forEach(s -> assertTrue(items.contains(s)));

        SolrUtil.createCollection(miniCluster.getSolrClient(), "Scholix", 4, 1, 2, collectionName);

        log.debug("Collection Created");
        final Map<String, String> queryParamMap = new HashMap<>();
        queryParamMap.put("q", "*:*");
        MapSolrParams queryParams = new MapSolrParams(queryParamMap);
        final QueryResponse response = miniCluster.getSolrClient().query("Scholix", queryParams);
        final SolrDocumentList documents = response.getResults();
        assertEquals(0, documents.getNumFound());
    }

    @Test
    @Order(3)
    public void testFeedingSolrDocument() throws Exception {

        InputStream gzipStream = new GZIPInputStream(
            Objects.requireNonNull(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/provision/scholix_records.gz")));
        Reader decoder = new InputStreamReader(gzipStream, StandardCharsets.UTF_8);
        BufferedReader buffered = new BufferedReader(decoder);
        String line = buffered.readLine();

        final CloudSolrClient client = miniCluster.getSolrClient();
        client.setDefaultCollection("Scholix");
        int added = 0;
        while (line != null) {
            final SolrInputDocument solrDocument = ScholixToSolr.toSolrDocument(line);
            client.add(solrDocument);
            added++;
            line = buffered.readLine();
        }
        client.commit();

        log.debug(String.format("Feed %d documents", added));

        final SolrDocumentList documents = executeQuery("*:*");
        assertEquals(added, documents.getNumFound());

        documents.stream().map(s -> s.getFirstValue("source_pid").toString()).forEach(System.out::println);

        SolrDocumentList source_pids = executeQuery("source_pid:\"10.15468/dl.u47azs\"");
        System.out.println("source_pid.getNumFound() = " + source_pids.getNumFound());

        source_pids.stream().map(s -> s.getFieldValue("source_pid")).forEach(System.out::println);
    }

    private SolrDocumentList executeQuery(final String query) throws SolrServerException, IOException {
        final Map<String, String> queryParamMap = new HashMap<>();
        queryParamMap.put("q", query);
        MapSolrParams queryParams = new MapSolrParams(queryParamMap);
        final QueryResponse response = miniCluster.getSolrClient().query("Scholix", queryParams);
        return response.getResults();
    }

    /***
     * Utility for parsing XML
     * @param xml
     * @return Dom4J Document
     * @throws DocumentException
     */
    private Document parseDocument(final String xml) throws DocumentException {
        return new SAXReader().read(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
    }

    private String loadSMFLayout() throws IOException {
        return IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream(LAYOUT_PATH)));
    }
}

View File: SolrAdminApplicationTest.java

@@ -1,12 +1,12 @@
package eu.dnetlib.dhp.oa.provision;

import static org.junit.jupiter.api.Assertions.assertEquals;

import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.junit.jupiter.api.Test;

class SolrAdminApplicationTest extends SolrTest {

    @Test

View File: pom.xml

@@ -352,7 +352,9 @@
            </goals>
            <configuration>
                <tasks>
                    <!--suppress UnresolvedMavenProperty -->
                    <property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
                    <!--suppress UnresolvedMavenProperty -->
                    <unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
                </tasks>
            </configuration>
@@ -427,9 +429,12 @@
            <configuration>
                <executable>ssh</executable>
                <arguments>
                    <!--suppress UnresolvedMavenProperty -->
                    <argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
                    <!--suppress UnresolvedMavenProperty -->
                    <argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
                    <argument>-o StrictHostKeyChecking=no</argument>
                    <!--suppress UnresolvedMavenProperty -->
                    <argument>rm -rf ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; mkdir -p ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/</argument>
                </arguments>
            </configuration>
@@ -443,9 +448,11 @@
            <configuration>
                <executable>scp</executable>
                <arguments>
                    <!--suppress UnresolvedMavenProperty -->
                    <argument>-P ${dhp.hadoop.frontend.port.ssh}</argument>
                    <argument>-o StrictHostKeyChecking=no</argument>
                    <argument>target/${oozie.package.file.name}.tar.gz</argument>
                    <!--suppress UnresolvedMavenProperty -->
                    <argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}:${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/${oozie.package.file.name}.tar.gz</argument>
                </arguments>
            </configuration>
@@ -460,11 +467,15 @@
            <executable>ssh</executable>
            <!-- <outputFile>target/redirected_upload.log</outputFile> -->
            <arguments>
                <!--suppress UnresolvedMavenProperty -->
                <argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
                <!--suppress UnresolvedMavenProperty -->
                <argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
                <argument>-o StrictHostKeyChecking=no</argument>
                <!--suppress UnresolvedMavenProperty -->
                <argument>cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; </argument>
                <argument>tar -zxf oozie-package.tar.gz; </argument>
                <!--suppress UnresolvedMavenProperty -->
                <argument>rm ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/oozie-package.tar.gz; </argument>
                <argument>./upload_workflow.sh</argument>
            </arguments>
@@ -495,9 +506,12 @@
            <!-- this file will be used by test verification profile reading job identifier -->
            <outputFile>${oozie.execution.log.file.location}</outputFile>
            <arguments>
                <!--suppress UnresolvedMavenProperty -->
                <argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
                <!--suppress UnresolvedMavenProperty -->
                <argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
                <argument>-o StrictHostKeyChecking=no</argument>
                <!--suppress UnresolvedMavenProperty -->
                <argument>cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; </argument>
                <argument>./run_workflow.sh</argument>
            </arguments>
@@ -512,6 +526,7 @@
            <configuration>
                <executable>cat</executable>
                <arguments>
                    <!--suppress UnresolvedMavenProperty -->
                    <argument>${oozie.execution.log.file.location}</argument>
                </arguments>
            </configuration>