forked from D-Net/dnet-hadoop
186 lines
6.2 KiB
Java
186 lines
6.2 KiB
Java
|
|
package eu.dnetlib.dhp.oa.provision;
|
|
|
|
import static org.junit.jupiter.api.Assertions.*;
|
|
|
|
import java.io.*;
|
|
import java.nio.charset.StandardCharsets;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
import java.nio.file.Paths;
|
|
import java.util.*;
|
|
import java.util.stream.Collectors;
|
|
import java.util.zip.GZIPInputStream;
|
|
|
|
import org.apache.commons.io.IOUtils;
|
|
import org.apache.commons.lang3.StringUtils;
|
|
import org.apache.solr.client.solrj.SolrServerException;
|
|
import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
|
import org.apache.solr.client.solrj.response.QueryResponse;
|
|
import org.apache.solr.common.SolrDocument;
|
|
import org.apache.solr.common.SolrDocumentList;
|
|
import org.apache.solr.common.SolrInputDocument;
|
|
import org.apache.solr.common.params.MapSolrParams;
|
|
import org.dom4j.Document;
|
|
import org.dom4j.DocumentException;
|
|
import org.dom4j.Element;
|
|
import org.dom4j.Node;
|
|
import org.dom4j.io.SAXReader;
|
|
import org.junit.jupiter.api.*;
|
|
|
|
import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr;
|
|
|
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
|
public class ScholixIndexingTest extends SolrTest {
|
|
|
|
/** Classpath location of the SMF layout XML that {@link #loadSMFLayout()} reads. */
private static final String LAYOUT_PATH = "/eu/dnetlib/dhp/oa/provision/SMF_layout.xml";
|
|
|
|
/**
|
|
* This test verifies that the schema will be generated correctly
|
|
* by get the profile of the metadataFormat and generating solr schema.xml
|
|
* we expect that the fiedl in the metadataFormat are all in the field solr schema
|
|
* @throws Exception
|
|
*/
|
|
@Test
|
|
@Order(1)
|
|
void testSchemaCreation() throws Exception {
|
|
|
|
final String layout = loadSMFLayout();
|
|
assertNotNull(layout);
|
|
assertTrue(StringUtils.isNotBlank(layout));
|
|
|
|
final String scheme = SolrUtil.getSchemaXML(loadSMFLayout());
|
|
assertNotNull(scheme);
|
|
assertTrue(StringUtils.isNotBlank(scheme));
|
|
|
|
final Document fields = parseDocument(layout);
|
|
List<Node> params = fields.selectNodes("//FIELD");
|
|
final List<String> exptectedFieldName = new ArrayList<>();
|
|
for (Node param : params) {
|
|
Element element = (Element) param;
|
|
String name = element.attributeValue("name");
|
|
exptectedFieldName.add(name.toLowerCase());
|
|
}
|
|
assertTrue(exptectedFieldName.size() > 0);
|
|
|
|
final Document parsedScheme = parseDocument(scheme);
|
|
params = parsedScheme.selectNodes("//field");
|
|
final List<String> createdFieldName = new ArrayList<>();
|
|
for (Node param : params) {
|
|
|
|
Element element = (Element) param;
|
|
String name = element.attributeValue("name");
|
|
createdFieldName.add(name.toLowerCase());
|
|
}
|
|
assertTrue(createdFieldName.size() > 0);
|
|
|
|
exptectedFieldName.stream().map(createdFieldName::contains).forEach(Assertions::assertTrue);
|
|
}
|
|
|
|
/***
|
|
* Test the creation of the index works
|
|
* we test if all the files are uploaded into
|
|
* the zookeeper instance of SOLR under it's
|
|
* collection name
|
|
* @throws Exception
|
|
*/
|
|
@Test
|
|
@Order(2)
|
|
public void testCreateCollection() throws Exception {
|
|
final String collectionName = "SMF-index-scholix";
|
|
SolrUtil.uploadZookeperConfig(miniCluster.getZkClient(), collectionName, true, loadSMFLayout());
|
|
|
|
assertTrue(miniCluster.getZkClient().exists("/configs/" + collectionName, true));
|
|
List<String> items = miniCluster.getZkClient().getChildren("/configs/" + collectionName, null, true);
|
|
|
|
List<String> configurationFiles = Files
|
|
.list(
|
|
Paths
|
|
.get(
|
|
Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_BASE_PATH + "files/")).getPath()))
|
|
.map(Path::getFileName)
|
|
.filter(p -> !p.getFileName().toString().equalsIgnoreCase("file_list"))
|
|
.map(Path::toString)
|
|
.collect(Collectors.toList());
|
|
configurationFiles.add("schema.xml");
|
|
configurationFiles.add("solrconfig.xml");
|
|
configurationFiles.forEach(s -> assertTrue(items.contains(s)));
|
|
|
|
SolrUtil.createCollection(miniCluster.getSolrClient(), "Scholix", 4, 1, 2, collectionName);
|
|
|
|
log.debug("Collection Created");
|
|
final Map<String, String> queryParamMap = new HashMap<>();
|
|
queryParamMap.put("q", "*:*");
|
|
|
|
MapSolrParams queryParams = new MapSolrParams(queryParamMap);
|
|
final QueryResponse response = miniCluster.getSolrClient().query("Scholix", queryParams);
|
|
final SolrDocumentList documents = response.getResults();
|
|
assertEquals(0, documents.getNumFound());
|
|
|
|
}
|
|
|
|
@Test
|
|
@Order(3)
|
|
public void testFeedingSolrDocument() throws Exception {
|
|
|
|
InputStream gzipStream = new GZIPInputStream(
|
|
Objects.requireNonNull(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/provision/scholix_records.gz")));
|
|
Reader decoder = new InputStreamReader(gzipStream, StandardCharsets.UTF_8);
|
|
BufferedReader buffered = new BufferedReader(decoder);
|
|
String line = buffered.readLine();
|
|
|
|
final CloudSolrClient client = miniCluster.getSolrClient();
|
|
client.setDefaultCollection("Scholix");
|
|
int added = 0;
|
|
while (line != null) {
|
|
|
|
final SolrInputDocument solrDocument = ScholixToSolr.toSolrDocument(line);
|
|
|
|
client.add(solrDocument);
|
|
added++;
|
|
line = buffered.readLine();
|
|
}
|
|
|
|
client.commit();
|
|
|
|
log.debug(String.format("Feed %d documents", added));
|
|
|
|
final SolrDocumentList documents = executeQuery("*:*");
|
|
assertEquals(added, documents.getNumFound());
|
|
|
|
documents.stream().map(s -> s.getFirstValue("source_pid").toString()).forEach(System.out::println);
|
|
|
|
SolrDocumentList source_pids = executeQuery("source_pid:\"10.15468/dl.u47azs\"");
|
|
|
|
System.out.println("source_pid.getNumFound() = " + source_pids.getNumFound());
|
|
|
|
source_pids.stream().map(s -> s.getFieldValue("source_pid")).forEach(System.out::println);
|
|
|
|
}
|
|
|
|
private SolrDocumentList executeQuery(final String query) throws SolrServerException, IOException {
|
|
|
|
final Map<String, String> queryParamMap = new HashMap<>();
|
|
queryParamMap.put("q", query);
|
|
|
|
MapSolrParams queryParams = new MapSolrParams(queryParamMap);
|
|
final QueryResponse response = miniCluster.getSolrClient().query("Scholix", queryParams);
|
|
return response.getResults();
|
|
}
|
|
|
|
/***
|
|
* Utility for parsing XML
|
|
* @param xml
|
|
* @return Dom4J Document
|
|
* @throws DocumentException
|
|
*/
|
|
private Document parseDocument(final String xml) throws DocumentException {
|
|
return new SAXReader().read(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
|
|
}
|
|
|
|
private String loadSMFLayout() throws IOException {
|
|
return IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream(LAYOUT_PATH)));
|
|
}
|
|
|
|
}
|