dnet-hadoop/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/ScholixIndexingTest.java

186 lines
6.2 KiB
Java

package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.*;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.*;

import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr;
/**
 * Ordered integration tests for the Scholix Solr index: schema generation from the
 * metadata-format layout, creation of the collection, and feeding of sample records.
 * <p>
 * The tests rely on {@code miniCluster} and {@code log}, which are not declared in this class
 * (presumably inherited from {@link SolrTest}), and they must run in the declared order because
 * each step depends on the state left behind by the previous one.
 */
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class ScholixIndexingTest extends SolrTest {

	/** Classpath location of the metadata-format layout used to generate the Solr schema. */
	private static final String LAYOUT_PATH = "/eu/dnetlib/dhp/oa/provision/SMF_layout.xml";

	/** Classpath location of the gzipped sample records, one record per line. */
	private static final String RECORDS_PATH = "/eu/dnetlib/dhp/oa/provision/scholix_records.gz";

	/** Name of the Solr collection created and queried by these tests. */
	private static final String COLLECTION_NAME = "Scholix";

	/** Name under which the configuration is uploaded to ZooKeeper ({@code /configs/<name>}). */
	private static final String CONFIG_NAME = "SMF-index-scholix";

	/**
	 * Verifies that the Solr schema is generated correctly from the metadata-format layout:
	 * every {@code FIELD} declared in the layout is expected to appear (case-insensitively)
	 * as a {@code field} of the generated schema.
	 *
	 * @throws Exception if the layout cannot be loaded or either document cannot be parsed
	 */
	@Test
	@Order(1)
	void testSchemaCreation() throws Exception {
		final String layout = loadSMFLayout();
		assertNotNull(layout);
		assertTrue(StringUtils.isNotBlank(layout), "the metadata-format layout must not be blank");

		// Reuse the layout already loaded instead of reading the resource a second time.
		final String schema = SolrUtil.getSchemaXML(layout);
		assertNotNull(schema);
		assertTrue(StringUtils.isNotBlank(schema), "the generated schema must not be blank");

		final Set<String> expectedFieldNames = fieldNames(parseDocument(layout), "//FIELD");
		assertFalse(expectedFieldNames.isEmpty(), "the layout declares no FIELD element");

		final Set<String> createdFieldNames = fieldNames(parseDocument(schema), "//field");
		assertFalse(createdFieldNames.isEmpty(), "the generated schema declares no field element");

		// Report every missing field at once rather than failing on an anonymous boolean.
		final Set<String> missing = new TreeSet<>(expectedFieldNames);
		missing.removeAll(createdFieldNames);
		assertTrue(missing.isEmpty(), "layout fields missing from the generated schema: " + missing);
	}

	/**
	 * Verifies that the collection can be created: all the configuration files must have been
	 * uploaded to ZooKeeper under the configuration name, and the freshly created collection
	 * must be empty.
	 *
	 * @throws Exception if the upload, the collection creation or the query fails
	 */
	@Test
	@Order(2)
	public void testCreateCollection() throws Exception {
		SolrUtil.uploadZookeperConfig(miniCluster.getZkClient(), CONFIG_NAME, true, loadSMFLayout());

		final String configPath = "/configs/" + CONFIG_NAME;
		assertTrue(miniCluster.getZkClient().exists(configPath, true), "missing ZooKeeper node " + configPath);
		final List<String> uploaded = miniCluster.getZkClient().getChildren(configPath, null, true);

		// Going through the URI (not URL.getPath()) keeps encoded characters and Windows paths working.
		final Path confDir = Paths
			.get(
				Objects
					.requireNonNull(
						getClass().getResource(SolrUtil.CONF_BASE_PATH + "files/"),
						"configuration directory not found on the classpath")
					.toURI());

		final List<String> configurationFiles;
		// Files.list holds an open directory handle until the stream is closed.
		try (Stream<Path> files = Files.list(confDir)) {
			configurationFiles = files
				.map(p -> p.getFileName().toString())
				.filter(name -> !name.equalsIgnoreCase("file_list"))
				// an explicit ArrayList, since the list is extended below
				.collect(Collectors.toCollection(ArrayList::new));
		}
		// These two are expected in ZooKeeper in addition to the files of the directory.
		configurationFiles.add("schema.xml");
		configurationFiles.add("solrconfig.xml");

		for (final String file : configurationFiles) {
			assertTrue(uploaded.contains(file), "configuration file not uploaded to ZooKeeper: " + file);
		}

		SolrUtil.createCollection(miniCluster.getSolrClient(), COLLECTION_NAME, 4, 1, 2, CONFIG_NAME);
		log.debug("Collection Created");

		assertEquals(0, executeQuery("*:*").getNumFound());
	}

	/**
	 * Feeds the sample records into the collection, one Solr document per line of the gzipped
	 * resource, and verifies that the index contains exactly as many documents as were added.
	 *
	 * @throws Exception if the records cannot be read, converted, indexed or queried
	 */
	@Test
	@Order(3)
	public void testFeedingSolrDocument() throws Exception {
		final CloudSolrClient client = miniCluster.getSolrClient();
		client.setDefaultCollection(COLLECTION_NAME);

		int added = 0;
		try (
			InputStream raw = Objects
				.requireNonNull(
					getClass().getResourceAsStream(RECORDS_PATH), "sample records not found on the classpath");
			BufferedReader reader = new BufferedReader(
				new InputStreamReader(new GZIPInputStream(raw), StandardCharsets.UTF_8))) {
			String line;
			while ((line = reader.readLine()) != null) {
				final SolrInputDocument solrDocument = ScholixToSolr.toSolrDocument(line);
				client.add(solrDocument);
				added++;
			}
		}
		client.commit();
		log.debug(String.format("Feed %d documents", added));

		// Without this guard an empty fixture would make the count comparison pass vacuously.
		assertTrue(added > 0, "the sample records resource contains no record");

		final SolrDocumentList documents = executeQuery("*:*");
		assertEquals(added, documents.getNumFound());
		for (final SolrDocument document : documents) {
			// String concatenation tolerates documents without a source_pid value.
			log.debug("source_pid = " + document.getFirstValue("source_pid"));
		}

		// NOTE(review): no expected hit count is pinned for this query; the result is only logged.
		final SolrDocumentList sourcePids = executeQuery("source_pid:\"10.15468/dl.u47azs\"");
		log.debug("source_pid.getNumFound() = " + sourcePids.getNumFound());
		for (final SolrDocument document : sourcePids) {
			log.debug(String.valueOf(document.getFieldValue("source_pid")));
		}
	}

	/**
	 * Runs a query against the test collection.
	 *
	 * @param query the value of the Solr {@code q} parameter
	 * @return the documents returned by Solr for the query
	 * @throws SolrServerException if Solr reports an error
	 * @throws IOException if the communication with Solr fails
	 */
	private SolrDocumentList executeQuery(final String query) throws SolrServerException, IOException {
		final MapSolrParams queryParams = new MapSolrParams(Collections.singletonMap("q", query));
		final QueryResponse response = miniCluster.getSolrClient().query(COLLECTION_NAME, queryParams);
		return response.getResults();
	}

	/**
	 * Collects the lower-cased {@code name} attribute of every element selected by the XPath.
	 *
	 * @param document the document to inspect
	 * @param xpath an XPath expression selecting elements carrying a {@code name} attribute
	 * @return the distinct lower-cased names
	 */
	private Set<String> fieldNames(final Document document, final String xpath) {
		final List<Node> nodes = document.selectNodes(xpath);
		final Set<String> names = new HashSet<>();
		for (final Node node : nodes) {
			final String name = ((Element) node).attributeValue("name");
			assertNotNull(name, "element selected by " + xpath + " has no name attribute");
			// Locale.ROOT keeps the comparison independent of the JVM default locale.
			names.add(name.toLowerCase(Locale.ROOT));
		}
		return names;
	}

	/**
	 * Utility for parsing XML.
	 *
	 * @param xml the XML text to parse
	 * @return the dom4j Document
	 * @throws DocumentException if the text is not well-formed XML
	 */
	private Document parseDocument(final String xml) throws DocumentException {
		// NOTE(review): the reader is not hardened against external entities; inputs here are test resources.
		return new SAXReader().read(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
	}

	/**
	 * Loads the metadata-format layout from the classpath.
	 *
	 * @return the layout XML, decoded as UTF-8
	 * @throws IOException if the resource cannot be read
	 */
	private String loadSMFLayout() throws IOException {
		try (InputStream in = Objects
			.requireNonNull(getClass().getResourceAsStream(LAYOUT_PATH), "layout not found: " + LAYOUT_PATH)) {
			// Explicit charset: the single-argument overload is deprecated and platform dependent.
			return IOUtils.toString(in, StandardCharsets.UTF_8);
		}
	}
}