solr index refactoring

This commit is contained in:
Michele Artini 2023-10-30 16:00:10 +01:00
parent 0538c16484
commit ab1d2c35e4
9 changed files with 175 additions and 155 deletions

View File

@ -11,6 +11,9 @@ import eu.dnetlib.common.index.solr.SolrService;
@SpringBootApplication
public class IndexManagerApplication extends AbstractDnetApp {
@Value("${solr.cloud}")
private boolean solrCloud;
@Value("${solr.urls}")
private String[] solrUrls;
@ -20,6 +23,6 @@ public class IndexManagerApplication extends AbstractDnetApp {
@Bean
public SolrService solrService() {
return new SolrService(solrUrls);
return new SolrService(solrCloud, solrUrls);
}
}

View File

@ -45,6 +45,9 @@ public class WfExecutorApplication extends AbstractDnetApp {
@Value("${mdstores.data.datasource.password}")
private String databasePassword;
@Value("${solr.cloud}")
private boolean solrCloud;
@Value("${solr.urls}")
private String[] solrUrls;
@ -70,7 +73,7 @@ public class WfExecutorApplication extends AbstractDnetApp {
@Bean
public SolrService solrService() {
return new SolrService(solrUrls);
return new SolrService(solrCloud, solrUrls);
}
}

View File

@ -4,8 +4,6 @@ import java.io.Serializable;
import jakarta.persistence.Column;
import jakarta.persistence.Entity;
import jakarta.persistence.EnumType;
import jakarta.persistence.Enumerated;
import jakarta.persistence.Id;
import jakarta.persistence.IdClass;
import jakarta.persistence.Table;
@ -31,50 +29,9 @@ public class IndexField implements Serializable {
@Column(name = "xpath")
private String xpath;
@Enumerated(EnumType.STRING)
@Column(name = "type")
private IndexFieldType type;
// TODO (HIGH PRIORITY): DELETE ??
@Deprecated
@Column(name = "indexable")
private boolean indexable;
// TODO (HIGH PRIORITY): DELETE ??
@Deprecated
@Column(name = "result")
private boolean result;
// TODO (HIGH PRIORITY): DELETE ??
@Deprecated
@Column(name = "header")
private boolean header;
// TODO (HIGH PRIORITY): DELETE ??
@Deprecated
@Column(name = "stat")
private boolean stat;
// TODO (HIGH PRIORITY): DELETE ??
@Deprecated
@Column(name = "tokenizable")
private boolean tokenizable;
// TODO (HIGH PRIORITY): DELETE ??
@Deprecated
@Column(name = "multivalued")
private boolean multiValued;
// TODO (HIGH PRIORITY): DELETE ??
@Deprecated
@Column(name = "stored")
private boolean stored;
// TODO (HIGH PRIORITY): DELETE ??
@Deprecated
@Column(name = "copy")
private boolean copy;
public String getIndexId() {
return indexId;
}
@ -107,92 +64,12 @@ public class IndexField implements Serializable {
this.constant = constant;
}
public IndexFieldType getType() {
return type;
}
public void setType(final IndexFieldType type) {
this.type = type;
}
@Deprecated
public boolean isIndexable() {
return indexable;
}
@Deprecated
public void setIndexable(final boolean indexable) {
this.indexable = indexable;
}
@Deprecated
public boolean isResult() {
return result;
}
@Deprecated
public void setResult(final boolean result) {
this.result = result;
}
@Deprecated
public boolean isHeader() {
return header;
}
@Deprecated
public void setHeader(final boolean header) {
this.header = header;
}
@Deprecated
public boolean isStat() {
return stat;
}
@Deprecated
public void setStat(final boolean stat) {
this.stat = stat;
}
@Deprecated
public boolean isTokenizable() {
return tokenizable;
}
@Deprecated
public void setTokenizable(final boolean tokenizable) {
this.tokenizable = tokenizable;
}
@Deprecated
public boolean isMultiValued() {
return multiValued;
}
@Deprecated
public void setMultiValued(final boolean multiValued) {
this.multiValued = multiValued;
}
@Deprecated
public boolean isStored() {
return stored;
}
@Deprecated
public void setStored(final boolean stored) {
this.stored = stored;
}
@Deprecated
public boolean isCopy() {
return copy;
}
@Deprecated
public void setCopy(final boolean copy) {
this.copy = copy;
}
}

View File

@ -1,5 +0,0 @@
package eu.dnetlib.domain.index;
/**
 * Value types that could be declared for an index field.
 *
 * The declaration order of the constants is kept unchanged so that
 * {@code ordinal()} and {@code values()} behave exactly as before.
 */
public enum IndexFieldType {
	/** Plain textual value. */
	STRING,
	/** Floating point numeric value. */
	DOUBLE,
	/** Boolean value. */
	BOOLEAN,
	/** Integral numeric value. */
	LONG,
	/** Date value. */
	DATE,
	/** Date-time value. */
	DATETIME
}

View File

@ -0,0 +1,33 @@
package eu.dnetlib.common.index.solr;
import eu.dnetlib.domain.index.IndexField;
import jakarta.persistence.Transient;
public class SolrField extends IndexField {
private static final long serialVersionUID = -8910762200990817492L;
@Transient
private final String type;
@Transient
private final boolean multiValued;
public SolrField(final IndexField field, final String type, final boolean multiValued) {
setName(field.getName());
setConstant(field.getConstant());
setIndexId(field.getIndexId());
setXpath(field.getXpath());
setResult(field.isResult());
this.type = type;
this.multiValued = multiValued;
}
public String getType() {
return type;
}
public boolean isMultiValued() {
return multiValued;
}
}

View File

@ -2,15 +2,16 @@ package eu.dnetlib.common.index.solr;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.client.solrj.SolrClient;
@ -18,11 +19,11 @@ import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.ORDER;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.Http2SolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.schema.SchemaRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.client.solrj.response.schema.SchemaRepresentation;
import org.apache.solr.client.solrj.response.schema.SchemaResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
@ -33,10 +34,8 @@ import org.dom4j.Node;
import eu.dnetlib.domain.index.IndexConfiguration;
import eu.dnetlib.domain.index.IndexField;
import eu.dnetlib.domain.index.IndexFieldType;
import eu.dnetlib.errors.DnetException;
import eu.dnetlib.errors.DnetRuntimeException;
import eu.dnetlib.utils.DateUtils;
public class SolrService {
@ -46,9 +45,11 @@ public class SolrService {
// https://solr.apache.org/guide/solr/latest/deployment-guide/solrj.html
private final boolean cloud;
private final String[] solrUrls;
public SolrService(final String... solrUrls) {
public SolrService(final boolean cloud, final String... solrUrls) {
this.cloud = cloud;
this.solrUrls = solrUrls;
}
@ -60,8 +61,6 @@ public class SolrService {
query.setStart(from);
query.setRows(limit);
// TODO (HIGH PRIORITY) the result fields should be obtained using findSolrSchema()
conf.getFields()
.stream()
.filter(IndexField::isResult)
@ -89,7 +88,8 @@ public class SolrService {
public int indexRecord(final IndexConfiguration conf, final String xml, final boolean commit) throws DnetException {
try (final SolrClient solr = newSolrClient()) {
final UpdateResponse updateResponse = solr.add(conf.getId(), asSolrDocument(conf, xml));
final Set<SolrField> solrFields = findSolrFields(conf);
final UpdateResponse updateResponse = solr.add(conf.getId(), asSolrDocument(xml, solrFields));
if (commit) {
forceCommit(solr, conf.getId());
}
@ -103,7 +103,8 @@ public class SolrService {
public int indexRecords(final IndexConfiguration conf, final Stream<String> inputStream) throws DnetException {
try (final SolrClient solr = newSolrClient()) {
final Iterator<SolrInputDocument> iterator = inputStream.map(s -> asSolrDocument(conf, s)).iterator();
final Set<SolrField> solrFields = findSolrFields(conf);
final Iterator<SolrInputDocument> iterator = inputStream.map(xml -> asSolrDocument(xml, solrFields)).iterator();
final UpdateResponse updateResponse = solr.add(conf.getId(), iterator);
forceCommit(solr, conf.getId());
return updateResponse.getResponse().size();
@ -123,21 +124,19 @@ public class SolrService {
}
private SolrClient newSolrClient() {
return new CloudSolrClient.Builder(Arrays.asList(solrUrls)).build();
return cloud ? new CloudSolrClient.Builder(Arrays.asList(solrUrls)).build() : new Http2SolrClient.Builder(solrUrls[0]).build();
}
private void forceCommit(final SolrClient client, final String solrCollection) throws SolrServerException, IOException {
client.commit(solrCollection);
}
private SolrInputDocument asSolrDocument(final IndexConfiguration conf, final String xml) {
private SolrInputDocument asSolrDocument(final String xml, final Set<SolrField> solrFields) {
try {
final Document xmlDoc = DocumentHelper.parseText(xml);
// TODO (HIGH PRIORITY) the multiValued fields should be obtained using findSolrSchema()
final SolrInputDocument doc = new SolrInputDocument();
conf.getFields()
solrFields
.stream()
.filter(f -> StringUtils.isNotBlank(f.getName()))
.forEach(f -> {
@ -166,14 +165,16 @@ public class SolrService {
}
}
private Object convertToType(final String s, final IndexFieldType type) {
return switch (type) {
case STRING -> s;
case LONG -> NumberUtils.toLong(s);
case DOUBLE -> NumberUtils.toDouble(s);
case BOOLEAN -> BooleanUtils.toBoolean(s);
case DATE -> DateUtils.parseDate(s);
case DATETIME -> DateUtils.parseDateTime(s);
private Object convertToType(final String s, final String solrType) {
// TODO (HIGH PRIORITY): complete the mapping
return switch (solrType) {
case "string", "text_general" -> s;
// case LONG -> NumberUtils.toLong(s);
// case DOUBLE -> NumberUtils.toDouble(s);
case "boolean" -> BooleanUtils.toBoolean(s);
// case DATE -> DateUtils.parseDate(s);
// case DATETIME -> DateUtils.parseDateTime(s);
default -> s;
};
}
@ -196,13 +197,28 @@ public class SolrService {
}
}
protected SchemaRepresentation findSolrSchema(final IndexConfiguration conf) throws DnetException {
protected Set<SolrField> findSolrFields(final IndexConfiguration conf) throws DnetException {
// TODO (HIGH PRIORITY) test and use to obtain info relative to the schema
// TODO (HIGH PRIORITY) the method should return also the not configured fields ???
try (final SolrClient solr = newSolrClient()) {
final SchemaRequest request = new SchemaRequest();
final SchemaResponse response = request.process(solr, conf.getId());
return response.getSchemaRepresentation();
final Set<SolrField> res = new HashSet<>();
for (final Map<String, Object> map : response.getSchemaRepresentation().getFields()) {
final String name = map.getOrDefault("name", "").toString();
for (final IndexField f : conf.getFields()) {
if (StringUtils.equals(f.getName(), name)) {
final String type = map.getOrDefault("type", "string").toString();
final boolean multivalued = BooleanUtils.toBoolean(map.getOrDefault("multiValued", "false").toString());
res.add(new SolrField(f, type, multivalued));
}
}
}
return res;
} catch (final Throwable e) {
log.error("error deleting index: " + conf.getId(), e);
throw new DnetException("error deleting commit: " + conf.getId(), e);

View File

@ -0,0 +1,90 @@
package eu.dnetlib.common.index.solr;
import static org.junit.jupiter.api.Assertions.fail;
import java.util.Set;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.domain.index.IndexConfiguration;
import eu.dnetlib.errors.DnetException;
/**
 * Test skeleton for {@link SolrService}.
 *
 * Most of the test methods are placeholders that fail on purpose until they
 * are implemented. The only implemented one, {@link #testFindSolrSchema()},
 * talks to the Solr instance configured in {@link #setUp()}.
 */
class SolrServiceTest {

	// TODO (HIGH PRIORITY) Complete the tests

	/** Identifier of the index (Solr collection) used by the tests. */
	private static final String TEST_INDEX = "gettingstarted";

	private SolrService solr;

	private IndexConfiguration conf;

	/**
	 * Creates a non-cloud service pointing to a local Solr and an index
	 * configuration that only carries the id of the test index.
	 */
	@BeforeEach
	void setUp() throws Exception {
		this.solr = new SolrService(false, "http://localhost:8983/solr");

		final IndexConfiguration configuration = new IndexConfiguration();
		configuration.setId(TEST_INDEX);
		this.conf = configuration;
	}

	/** Marks the calling test as failed because it has no implementation yet. */
	private static void notYetImplemented() {
		fail("Not yet implemented");
	}

	@Test
	void testSolrService() {
		notYetImplemented();
	}

	@Test
	void testQuery() {
		notYetImplemented();
	}

	@Test
	void testIndexRecord() {
		notYetImplemented();
	}

	@Test
	void testIndexRecords() {
		notYetImplemented();
	}

	@Test
	void testCommit() {
		notYetImplemented();
	}

	@Test
	void testExistsIndex() {
		notYetImplemented();
	}

	@Test
	void testCreateIndex() {
		notYetImplemented();
	}

	/**
	 * Retrieves the Solr fields of the test index and prints them as JSON on
	 * the standard output; no assertion is performed on the result.
	 */
	@Test
	void testFindSolrSchema() throws DnetException, JsonProcessingException {
		final Set<SolrField> fields = this.solr.findSolrFields(this.conf);
		final ObjectMapper mapper = new ObjectMapper();
		final String json = mapper.writeValueAsString(fields);
		System.out.println(json);
	}

	@Test
	void testDeleteIndex() {
		notYetImplemented();
	}

	@Test
	void testDeleteByQuery() {
		notYetImplemented();
	}

	@Test
	void testDeleteOldRecords() {
		notYetImplemented();
	}
}

View File

@ -200,10 +200,13 @@ services:
solr:
image: solr:9.4.0
ports:
- ${SOLR_PORT}:${SOLR_PORT}
expose:
- ${SOLR_PORT}
networks:
- backend
- frontend
volumes:
- solrdata:/var/solr
command:

View File

@ -17,7 +17,7 @@ export PG_VOCS_DB=dnet_vocabularies
export PG_CONTEXTS_DB=dnet_contexts
export PG_MDSTORES_DATA_DB=dnet_mdstores_data
export COMPOSE_PROFILES=base,mail,dsm,vocs,mdstores,wfs
export COMPOSE_PROFILES=base,index
#export COMPOSE_PROFILES=base,mail,dsm,vocs,mdstores,wfs,index,contexts,ui
docker-compose -f docker-compose.dev.yml up --force-recreate --build