forked from D-Net/dnet-hadoop
Compare commits
7 Commits
master
...
scholix_to
Author | SHA1 | Date |
---|---|---|
Sandro La Bruzzo | ffa8cdf981 | |
Sandro La Bruzzo | 818a936468 | |
Sandro La Bruzzo | 4b8739e45b | |
Sandro La Bruzzo | 7784b3d9c4 | |
Sandro La Bruzzo | 6d5cda1a03 | |
Sandro La Bruzzo | bf6c8ccc79 | |
Sandro La Bruzzo | 56f880c89d |
|
@ -3,6 +3,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
|
|||
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.time.LocalDate;
|
||||
import java.time.ZoneId;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
|
@ -36,6 +38,14 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
|
||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
||||
|
||||
public static final String BLANK = "";
|
||||
|
||||
private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
|
||||
|
||||
private static final String[] normalizeDateFormats = {
|
||||
"yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy"
|
||||
};
|
||||
|
||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
|
@ -459,6 +469,20 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
return Optional.ofNullable(cleanDate(date));
|
||||
}
|
||||
|
||||
public static String normalizeDate(String s) {
|
||||
final String date = StringUtils.isNotBlank(s) ? s.trim() : BLANK;
|
||||
|
||||
for (String format : normalizeDateFormats) {
|
||||
try {
|
||||
Date parse = new SimpleDateFormat(format).parse(date);
|
||||
String res = new SimpleDateFormat(normalizeOutFormat).format(parse);
|
||||
return res;
|
||||
} catch (ParseException e) {
|
||||
}
|
||||
}
|
||||
return BLANK;
|
||||
}
|
||||
|
||||
public static String cleanDate(final String inputDate) {
|
||||
|
||||
if (StringUtils.isBlank(inputDate)) {
|
||||
|
|
|
@ -7,6 +7,7 @@ import java.util.Date;
|
|||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
|
@ -14,15 +15,6 @@ import net.sf.saxon.value.SequenceType;
|
|||
import net.sf.saxon.value.StringValue;
|
||||
|
||||
public class NormalizeDate extends AbstractExtensionFunction {
|
||||
|
||||
private static final String[] normalizeDateFormats = {
|
||||
"yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy"
|
||||
};
|
||||
|
||||
private static final String normalizeOutFormat = "yyyy-MM-dd'T'hh:mm:ss'Z'";
|
||||
|
||||
public static final String BLANK = "";
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "normalizeDate";
|
||||
|
@ -31,10 +23,10 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
|||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null || arguments.length == 0) {
|
||||
return new StringValue(BLANK);
|
||||
return new StringValue(GraphCleaningFunctions.BLANK);
|
||||
}
|
||||
String s = arguments[0].head().getStringValue();
|
||||
return new StringValue(_normalizeDate(s));
|
||||
return new StringValue(GraphCleaningFunctions.normalizeDate(s));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -58,18 +50,4 @@ public class NormalizeDate extends AbstractExtensionFunction {
|
|||
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
|
||||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
private String _normalizeDate(String s) {
|
||||
final String date = StringUtils.isNotBlank(s) ? s.trim() : BLANK;
|
||||
|
||||
for (String format : normalizeDateFormats) {
|
||||
try {
|
||||
Date parse = new SimpleDateFormat(format).parse(date);
|
||||
String res = new SimpleDateFormat(normalizeOutFormat).format(parse);
|
||||
return res;
|
||||
} catch (ParseException e) {
|
||||
}
|
||||
}
|
||||
return BLANK;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -142,6 +142,21 @@ object DataciteToOAFTransformation {
|
|||
}
|
||||
}
|
||||
|
||||
/***
|
||||
* Use the vocabulary dnet:publication_resource to find a synonym to one of these terms and get the instance.type.
|
||||
* Using the dnet:result_typologies vocabulary, we look up the instance.type synonym
|
||||
* to generate one of the following main entities:
|
||||
* - publication
|
||||
* - dataset
|
||||
* - software
|
||||
* - otherresearchproduct
|
||||
|
||||
* @param resourceType
|
||||
* @param resourceTypeGeneral
|
||||
* @param schemaOrg
|
||||
* @param vocabularies
|
||||
* @return
|
||||
*/
|
||||
def getTypeQualifier(
|
||||
resourceType: String,
|
||||
resourceTypeGeneral: String,
|
||||
|
|
|
@ -45,6 +45,10 @@
|
|||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.antlr</groupId>
|
||||
<artifactId>stringtemplate</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
|
|
|
@ -23,7 +23,7 @@ public class SolrAdminApplication implements Closeable {
|
|||
private static final Logger log = LoggerFactory.getLogger(SolrAdminApplication.class);
|
||||
|
||||
enum Action {
|
||||
DELETE_BY_QUERY, COMMIT
|
||||
DELETE_BY_QUERY, COMMIT, CREATE
|
||||
}
|
||||
|
||||
private final CloudSolrClient solrClient;
|
||||
|
@ -56,6 +56,8 @@ public class SolrAdminApplication implements Closeable {
|
|||
|
||||
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
|
||||
|
||||
final String fields = isLookup.getLayoutSource(format);
|
||||
|
||||
final String zkHost = isLookup.getZkHost();
|
||||
log.info("zkHost: {}", zkHost);
|
||||
|
||||
|
@ -63,7 +65,7 @@ public class SolrAdminApplication implements Closeable {
|
|||
log.info("collection: {}", collection);
|
||||
|
||||
try (SolrAdminApplication app = new SolrAdminApplication(zkHost)) {
|
||||
app.execute(action, collection, query, commit);
|
||||
app.execute(action, collection, query, commit, fields);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -73,10 +75,10 @@ public class SolrAdminApplication implements Closeable {
|
|||
}
|
||||
|
||||
public SolrResponse commit(String collection) throws IOException, SolrServerException {
|
||||
return execute(Action.COMMIT, collection, null, true);
|
||||
return execute(Action.COMMIT, collection, null, true, null);
|
||||
}
|
||||
|
||||
public SolrResponse execute(Action action, String collection, String query, boolean commit)
|
||||
public SolrResponse execute(Action action, String collection, String query, boolean commit, final String fields)
|
||||
throws IOException, SolrServerException {
|
||||
switch (action) {
|
||||
|
||||
|
@ -88,6 +90,12 @@ public class SolrAdminApplication implements Closeable {
|
|||
return rsp;
|
||||
case COMMIT:
|
||||
return solrClient.commit(collection);
|
||||
case CREATE:
|
||||
SolrUtil
|
||||
.uploadZookeperConfig(this.solrClient.getZkStateReader().getZkClient(), collection, true, fields);
|
||||
SolrUtil.createCollection(this.solrClient, collection, 48, 1, 12, collection);
|
||||
return null;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("action not managed: " + action);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,245 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
||||
import org.apache.solr.client.solrj.request.QueryRequest;
|
||||
import org.apache.solr.common.cloud.SolrZkClient;
|
||||
import org.apache.solr.common.params.CollectionParams;
|
||||
import org.apache.solr.common.params.CoreAdminParams;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.zookeeper.CreateMode;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.io.DocumentResult;
|
||||
import org.dom4j.io.DocumentSource;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.stringtemplate.v4.ST;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.databind.type.MapType;
|
||||
import com.fasterxml.jackson.databind.type.TypeFactory;
|
||||
|
||||
public class SolrUtil {
|
||||
|
||||
/**
|
||||
* The log.
|
||||
*/
|
||||
private static final Logger log = LoggerFactory.getLogger(SolrUtil.class);
|
||||
|
||||
/**
|
||||
* The Constant CONFIGS_PATH.
|
||||
*/
|
||||
private static final String CONFIGS_PATH = "/configs";
|
||||
|
||||
private static final char DELIMITER = '$';
|
||||
|
||||
public static final String CONF_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/";
|
||||
|
||||
// public static final String CONF_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/";
|
||||
|
||||
public static final String LIST_FILE_BASE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/files/file_list";
|
||||
|
||||
private static final String SCHEMA_TEMPLATE_PATH = "/eu/dnetlib/dhp/oa/provision/conf/schemaTemplate.xslt";
|
||||
|
||||
private static String createURLRequest = "http://%s:%s/solr/admin/collections?action=CREATE&name=%s&numShards=%s&replicationFactor=%s&maxShardsPerNode=%s&collection.configName=%s";
|
||||
|
||||
private static String generateCreateIndexRequest(final String host,
|
||||
final String port,
|
||||
final String collectionName,
|
||||
final String numShard,
|
||||
final String replicationFactor,
|
||||
final String collectionConfigName,
|
||||
final String maxShardsPerNode) {
|
||||
return String
|
||||
.format(
|
||||
createURLRequest, host, port, collectionName, numShard, replicationFactor, maxShardsPerNode,
|
||||
collectionConfigName);
|
||||
}
|
||||
|
||||
public static boolean createSolrIndex(final String host,
|
||||
final String port,
|
||||
final String collectionName,
|
||||
final String numShard,
|
||||
final String replicationFactor,
|
||||
final String maxShardsPerNode,
|
||||
final String collectionConfigName) throws Exception {
|
||||
|
||||
final String uri = generateCreateIndexRequest(
|
||||
host, port, collectionName, numShard, replicationFactor, maxShardsPerNode, collectionConfigName);
|
||||
|
||||
URL url = new URL(uri);
|
||||
System.out.println(uri);
|
||||
|
||||
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
|
||||
connection.setRequestMethod("GET");
|
||||
int status = connection.getResponseCode();
|
||||
System.out.println("status = " + status);
|
||||
|
||||
BufferedReader in = new BufferedReader(
|
||||
new InputStreamReader(connection.getInputStream()));
|
||||
String inputLine;
|
||||
StringBuffer content = new StringBuffer();
|
||||
while ((inputLine = in.readLine()) != null) {
|
||||
content.append(inputLine);
|
||||
}
|
||||
in.close();
|
||||
|
||||
log.debug("content = " + content);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public static void uploadZookeperConfig(final SolrZkClient zkClient,
|
||||
final String coreName,
|
||||
final boolean overwrite,
|
||||
final String layout) {
|
||||
|
||||
final String basepath = CONFIGS_PATH + "/" + coreName;
|
||||
|
||||
log.info("uploading solr configuration to ZK for index collection: " + coreName);
|
||||
try {
|
||||
if (overwrite && zkClient.getSolrZooKeeper().exists(basepath, false) != null) {
|
||||
log.info("cleanup ZK configuration: " + coreName);
|
||||
for (String child : zkClient.getSolrZooKeeper().getChildren(basepath, false)) {
|
||||
final String path = basepath + "/" + child;
|
||||
log.debug("cleanup ZK file: " + path);
|
||||
zkClient.delete(path, -1, true);
|
||||
}
|
||||
zkClient.delete(basepath, -1, true);
|
||||
}
|
||||
if (!zkClient.exists(basepath, true)) {
|
||||
log.info("upload ZK configuration: " + coreName);
|
||||
zkClient.makePath(basepath, true);
|
||||
uploadConfiguration(zkClient, basepath, buildConfiguration(layout));
|
||||
}
|
||||
log.info("upload ZK configuration complete");
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("unable to upload solr configuration", e);
|
||||
}
|
||||
}
|
||||
|
||||
private static void uploadConfiguration(final SolrZkClient zkClient, final String basePath,
|
||||
final Map<String, byte[]> resources) throws KeeperException,
|
||||
InterruptedException, IOException {
|
||||
|
||||
if (!zkClient.exists(basePath, true)) {
|
||||
zkClient.makePath(basePath, true);
|
||||
}
|
||||
|
||||
for (final Map.Entry<String, byte[]> e : resources.entrySet()) {
|
||||
String path = basePath + "/" + e.getKey();
|
||||
log.debug("upload ZK configuration: " + path);
|
||||
zkClient.create(path, e.getValue(), CreateMode.PERSISTENT, true);
|
||||
}
|
||||
}
|
||||
|
||||
private static String loadFileInClassPath(final String aPath) {
|
||||
System.out.println("LOAD FILE FROM PATH: " + aPath);
|
||||
try {
|
||||
return IOUtils
|
||||
.toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(aPath)), Charset.defaultCharset());
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public static Map<String, String> getServiceProperties() throws IOException {
|
||||
final String properties = loadFileInClassPath(CONF_BASE_PATH + "service_properties.json");
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
TypeFactory typeFactory = mapper.getTypeFactory();
|
||||
MapType mapType = typeFactory.constructMapType(HashMap.class, String.class, String.class);
|
||||
return mapper.readValue(properties, mapType);
|
||||
}
|
||||
|
||||
public static String getConfig() throws Exception {
|
||||
final Map<String, String> p = getServiceProperties();
|
||||
final String st = loadFileInClassPath(CONF_BASE_PATH + "solrconfig.xml.st");
|
||||
final ST solrConfig = new ST(st, DELIMITER, DELIMITER);
|
||||
p.forEach(solrConfig::add);
|
||||
return solrConfig.render();
|
||||
}
|
||||
|
||||
public static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
|
||||
int replicationFactor, int maxShardsPerNode, String configName) throws SolrServerException, IOException {
|
||||
ModifiableSolrParams modParams = new ModifiableSolrParams();
|
||||
modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
|
||||
modParams.set("name", name);
|
||||
modParams.set("numShards", numShards);
|
||||
modParams.set("replicationFactor", replicationFactor);
|
||||
modParams.set("collection.configName", configName);
|
||||
modParams.set("maxShardsPerNode", maxShardsPerNode);
|
||||
QueryRequest request = new QueryRequest(modParams);
|
||||
request.setPath("/admin/collections");
|
||||
return client.request(request);
|
||||
}
|
||||
|
||||
private static Map<String, byte[]> buildConfiguration(final String layout)
|
||||
throws Exception {
|
||||
|
||||
Map<String, byte[]> res = new HashMap<>();
|
||||
|
||||
try {
|
||||
log.debug("adding schema.xml to the resource map");
|
||||
res.put("schema.xml", getSchemaXML(layout).getBytes());
|
||||
|
||||
res.put("solrconfig.xml", getConfig().getBytes());
|
||||
log.debug("adding solrconfig.xml to the resource map");
|
||||
String data = IOUtils
|
||||
.toString(Objects.requireNonNull(SolrUtil.class.getResourceAsStream(LIST_FILE_BASE_PATH)));
|
||||
Arrays.stream(data.split("\n")).forEach(s -> {
|
||||
final String name = s.replace(CONF_BASE_PATH + "files/", "");
|
||||
res
|
||||
.put(
|
||||
name,
|
||||
Objects.requireNonNull(loadFileInClassPath(s)).getBytes(StandardCharsets.UTF_8));
|
||||
});
|
||||
return res;
|
||||
} catch (Throwable e) {
|
||||
throw new Exception("failed to build configuration", e);
|
||||
}
|
||||
}
|
||||
|
||||
public static String getSchemaXML(final String layout) throws Exception {
|
||||
|
||||
final Document fields = new SAXReader().read(new ByteArrayInputStream(layout.getBytes(StandardCharsets.UTF_8)));
|
||||
|
||||
Transformer transformer = TransformerFactory
|
||||
.newInstance()
|
||||
.newTransformer(
|
||||
new DocumentSource(new SAXReader().read(SolrUtil.class.getResourceAsStream(SCHEMA_TEMPLATE_PATH))));
|
||||
transformer.setParameter("textFieldType", "text_common");
|
||||
|
||||
final DocumentResult result = new DocumentResult();
|
||||
|
||||
transformer.transform(new DocumentSource(fields), result);
|
||||
String xml = result.getDocument().asXML();
|
||||
|
||||
log.debug("new index schema:\n" + xml);
|
||||
|
||||
return xml;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,121 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision.scholix;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.sx.scholix.*;
|
||||
|
||||
public class ScholixToSolr implements MapFunction<String, SolrInputDocument> {
|
||||
final static ObjectMapper MAPPER = new ObjectMapper();
|
||||
|
||||
public static SerializableSolrInputDocument toSolrDocument(final String json) {
|
||||
try {
|
||||
final Scholix input = MAPPER.readValue(json, Scholix.class);
|
||||
final SerializableSolrInputDocument output = new SerializableSolrInputDocument();
|
||||
|
||||
fillEntityField(output, input.getSource(), "source");
|
||||
fillEntityField(output, input.getTarget(), "target");
|
||||
final String cleanDate = GraphCleaningFunctions.cleanDate(input.getPublicationDate());
|
||||
|
||||
if (cleanDate != null)
|
||||
output.addField("publication_date", GraphCleaningFunctions.normalizeDate(cleanDate));
|
||||
|
||||
if (input.getRelationship() != null && input.getRelationship().getName() != null)
|
||||
output.addField("relation_name", input.getRelationship().getName());
|
||||
else
|
||||
return null;
|
||||
if (input.getRelationship() != null && input.getRelationship().getInverse() != null)
|
||||
output.addField("relation_inverse", input.getRelationship().getInverse());
|
||||
|
||||
if (input.getLinkprovider() != null) {
|
||||
final List<String> linkProviders = input
|
||||
.getLinkprovider()
|
||||
.stream()
|
||||
.map(ScholixEntityId::getName)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
output.addField("link_provider", linkProviders);
|
||||
}
|
||||
if (input.getPublisher() != null) {
|
||||
final List<String> publishers = input
|
||||
.getPublisher()
|
||||
.stream()
|
||||
.map(ScholixEntityId::getName)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
output.addField("publisher_name", publishers);
|
||||
}
|
||||
|
||||
output.addField("__indexrecordidentifier", input.getIdentifier());
|
||||
output.addField("__result", json);
|
||||
return output;
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Error on convert Scholix");
|
||||
}
|
||||
}
|
||||
|
||||
private static void fillEntityField(final SerializableSolrInputDocument document, final ScholixResource resource,
|
||||
final String prefix) {
|
||||
|
||||
document.addField(prefix + "_identifier", resource.getDnetIdentifier());
|
||||
document.addField(prefix + "_type", resource.getObjectType());
|
||||
document.addField(prefix + "_publication_date", resource.getPublicationDate());
|
||||
document.addField(prefix + "_subtype", resource.getObjectSubType());
|
||||
|
||||
List<String> resourcePIDs = resource
|
||||
.getIdentifier()
|
||||
.stream()
|
||||
.map(ScholixIdentifier::getIdentifier)
|
||||
.collect(Collectors.toList());
|
||||
document.addField(prefix + "_pid", resourcePIDs);
|
||||
|
||||
List<String> resourceSchemas = resource
|
||||
.getIdentifier()
|
||||
.stream()
|
||||
.map(ScholixIdentifier::getSchema)
|
||||
.collect(Collectors.toList());
|
||||
document.addField(prefix + "_schema", resourceSchemas);
|
||||
|
||||
if (resource.getPublisher() != null) {
|
||||
|
||||
final List<String> publishers = resource
|
||||
.getPublisher()
|
||||
.stream()
|
||||
.map(ScholixEntityId::getName)
|
||||
.collect(Collectors.toList());
|
||||
if (publishers.size() > 0)
|
||||
document.addField(prefix + "_publisher", publishers);
|
||||
}
|
||||
|
||||
if (resource.getCollectedFrom() != null) {
|
||||
|
||||
final List<String> collectedFrom = resource
|
||||
.getCollectedFrom()
|
||||
.stream()
|
||||
.map(ScholixCollectedFrom::getProvider)
|
||||
.filter(Objects::nonNull)
|
||||
.map(ScholixEntityId::getName)
|
||||
.collect(Collectors.toList());
|
||||
if (collectedFrom.size() > 0)
|
||||
document.addField(prefix + "_collected_from", collectedFrom);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public SerializableSolrInputDocument call(String s) throws Exception {
|
||||
return toSolrDocument(s);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
|
||||
package eu.dnetlib.dhp.sx.provision;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.lucidworks.spark.util.SolrSupport;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.provision.ProvisionConstants;
|
||||
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
|
||||
import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
|
||||
public class SparkIndexCollectionOnSOLR {
|
||||
|
||||
private static final Integer DEFAULT_BATCH_SIZE = 1000;
|
||||
|
||||
// LOGGER initialized
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkIndexCollectionOnSOLR.class);
|
||||
|
||||
public static void main(String[] args) throws IOException, ParseException {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
Objects
|
||||
.requireNonNull(
|
||||
SparkIndexCollectionOnSOLR.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/sx/provision/index_solr_parameters.json"))));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String cluster = parser.get("cluster");
|
||||
log.info("Cluster is {}", cluster);
|
||||
|
||||
final String format = parser.get("format");
|
||||
log.info("Index format name is {}", format);
|
||||
|
||||
final String isLookupUrl = parser.get("isURL");
|
||||
log.info("isURL is {}", isLookupUrl);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final Integer batchSize = Optional
|
||||
.ofNullable(parser.get("batchSize"))
|
||||
.map(Integer::valueOf)
|
||||
.orElse(DEFAULT_BATCH_SIZE);
|
||||
log.info("batchSize: {}", batchSize);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
conf.registerKryoClasses(new Class[] {
|
||||
SerializableSolrInputDocument.class
|
||||
});
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
|
||||
final String zkHost = isLookup.getZkHost();
|
||||
log.info("zkHost: {}", zkHost);
|
||||
final String collection = ProvisionConstants.getCollectionName(format);
|
||||
log.info("collection: {}", collection);
|
||||
feedScholixToSOLRIndex(spark, inputPath, collection, batchSize, zkHost);
|
||||
});
|
||||
}
|
||||
|
||||
public static void feedScholixToSOLRIndex(final SparkSession spark, final String inputPath, final String collection,
|
||||
Integer batchSize, final String zkHost) {
|
||||
final JavaRDD<SolrInputDocument> docs = spark
|
||||
.read()
|
||||
.text(inputPath)
|
||||
.as(Encoders.STRING())
|
||||
.map(new ScholixToSolr(), Encoders.kryo(SolrInputDocument.class))
|
||||
.toJavaRDD();
|
||||
SolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
<?xml version="1.0" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- Example exchange rates file for CurrencyField type named "currency" in example schema -->
|
||||
|
||||
<currencyConfig version="1.0">
|
||||
<rates>
|
||||
<!-- Updated from http://www.exchangerate.com/ at 2011-09-27 -->
|
||||
<rate from="USD" to="ARS" rate="4.333871" comment="ARGENTINA Peso" />
|
||||
<rate from="USD" to="AUD" rate="1.025768" comment="AUSTRALIA Dollar" />
|
||||
<rate from="USD" to="EUR" rate="0.743676" comment="European Euro" />
|
||||
<rate from="USD" to="BRL" rate="1.881093" comment="BRAZIL Real" />
|
||||
<rate from="USD" to="CAD" rate="1.030815" comment="CANADA Dollar" />
|
||||
<rate from="USD" to="CLP" rate="519.0996" comment="CHILE Peso" />
|
||||
<rate from="USD" to="CNY" rate="6.387310" comment="CHINA Yuan" />
|
||||
<rate from="USD" to="CZK" rate="18.47134" comment="CZECH REP. Koruna" />
|
||||
<rate from="USD" to="DKK" rate="5.515436" comment="DENMARK Krone" />
|
||||
<rate from="USD" to="HKD" rate="7.801922" comment="HONG KONG Dollar" />
|
||||
<rate from="USD" to="HUF" rate="215.6169" comment="HUNGARY Forint" />
|
||||
<rate from="USD" to="ISK" rate="118.1280" comment="ICELAND Krona" />
|
||||
<rate from="USD" to="INR" rate="49.49088" comment="INDIA Rupee" />
|
||||
<rate from="USD" to="XDR" rate="0.641358" comment="INTNL MON. FUND SDR" />
|
||||
<rate from="USD" to="ILS" rate="3.709739" comment="ISRAEL Sheqel" />
|
||||
<rate from="USD" to="JPY" rate="76.32419" comment="JAPAN Yen" />
|
||||
<rate from="USD" to="KRW" rate="1169.173" comment="KOREA (SOUTH) Won" />
|
||||
<rate from="USD" to="KWD" rate="0.275142" comment="KUWAIT Dinar" />
|
||||
<rate from="USD" to="MXN" rate="13.85895" comment="MEXICO Peso" />
|
||||
<rate from="USD" to="NZD" rate="1.285159" comment="NEW ZEALAND Dollar" />
|
||||
<rate from="USD" to="NOK" rate="5.859035" comment="NORWAY Krone" />
|
||||
<rate from="USD" to="PKR" rate="87.57007" comment="PAKISTAN Rupee" />
|
||||
<rate from="USD" to="PEN" rate="2.730683" comment="PERU Sol" />
|
||||
<rate from="USD" to="PHP" rate="43.62039" comment="PHILIPPINES Peso" />
|
||||
<rate from="USD" to="PLN" rate="3.310139" comment="POLAND Zloty" />
|
||||
<rate from="USD" to="RON" rate="3.100932" comment="ROMANIA Leu" />
|
||||
<rate from="USD" to="RUB" rate="32.14663" comment="RUSSIA Ruble" />
|
||||
<rate from="USD" to="SAR" rate="3.750465" comment="SAUDI ARABIA Riyal" />
|
||||
<rate from="USD" to="SGD" rate="1.299352" comment="SINGAPORE Dollar" />
|
||||
<rate from="USD" to="ZAR" rate="8.329761" comment="SOUTH AFRICA Rand" />
|
||||
<rate from="USD" to="SEK" rate="6.883442" comment="SWEDEN Krona" />
|
||||
<rate from="USD" to="CHF" rate="0.906035" comment="SWITZERLAND Franc" />
|
||||
<rate from="USD" to="TWD" rate="30.40283" comment="TAIWAN Dollar" />
|
||||
<rate from="USD" to="THB" rate="30.89487" comment="THAILAND Baht" />
|
||||
<rate from="USD" to="AED" rate="3.672955" comment="U.A.E. Dirham" />
|
||||
<rate from="USD" to="UAH" rate="7.988582" comment="UKRAINE Hryvnia" />
|
||||
<rate from="USD" to="GBP" rate="0.647910" comment="UNITED KINGDOM Pound" />
|
||||
|
||||
<!-- Cross-rates for some common currencies -->
|
||||
<rate from="EUR" to="GBP" rate="0.869914" />
|
||||
<rate from="EUR" to="NOK" rate="7.800095" />
|
||||
<rate from="GBP" to="NOK" rate="8.966508" />
|
||||
</rates>
|
||||
</currencyConfig>
|
|
@ -0,0 +1,42 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- If this file is found in the config directory, it will only be
|
||||
loaded once at startup. If it is found in Solr's data
|
||||
directory, it will be re-loaded every commit.
|
||||
|
||||
See http://wiki.apache.org/solr/QueryElevationComponent for more info
|
||||
|
||||
-->
|
||||
<elevate>
|
||||
<!-- Query elevation examples
|
||||
<query text="foo bar">
|
||||
<doc id="1" />
|
||||
<doc id="2" />
|
||||
<doc id="3" />
|
||||
</query>
|
||||
|
||||
for use with techproducts example
|
||||
|
||||
<query text="ipod">
|
||||
<doc id="MA147LL/A" /> put the actual ipod at the top
|
||||
<doc id="IW-02" exclude="true" /> exclude this cable
|
||||
</query>
|
||||
-->
|
||||
|
||||
</elevate>
|
|
@ -0,0 +1,6 @@
|
|||
/eu/dnetlib/dhp/oa/provision/conf/files/currency.xml
|
||||
/eu/dnetlib/dhp/oa/provision/conf/files/elevate.xml
|
||||
/eu/dnetlib/dhp/oa/provision/conf/files/params.json
|
||||
/eu/dnetlib/dhp/oa/provision/conf/files/protwords.txt
|
||||
/eu/dnetlib/dhp/oa/provision/conf/files/stopwords.txt
|
||||
/eu/dnetlib/dhp/oa/provision/conf/files/synonyms.txt
|
|
@ -0,0 +1,20 @@
|
|||
{"params":{
|
||||
"query":{
|
||||
"defType":"edismax",
|
||||
"q.alt":"*:*",
|
||||
"rows":"10",
|
||||
"fl":"*,score",
|
||||
"":{"v":0}
|
||||
},
|
||||
"facets":{
|
||||
"facet":"on",
|
||||
"facet.mincount": "1",
|
||||
"":{"v":0}
|
||||
},
|
||||
"velocity":{
|
||||
"wt": "velocity",
|
||||
"v.template":"browse",
|
||||
"v.layout": "layout",
|
||||
"":{"v":0}
|
||||
}
|
||||
}}
|
|
@ -0,0 +1,21 @@
|
|||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
# Use a protected word file to protect against the stemmer reducing two
|
||||
# unrelated words to the same base word.
|
||||
|
||||
# Some non-words that normally won't be encountered,
|
||||
# just to test that they won't be stemmed.
|
||||
dontstems
|
||||
zwhacky
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
a
|
||||
an
|
||||
and
|
||||
are
|
||||
as
|
||||
at
|
||||
be
|
||||
but
|
||||
by
|
||||
for
|
||||
if
|
||||
in
|
||||
into
|
||||
is
|
||||
it
|
||||
no
|
||||
not
|
||||
of
|
||||
on
|
||||
or
|
||||
s
|
||||
such
|
||||
t
|
||||
that
|
||||
the
|
||||
their
|
||||
then
|
||||
there
|
||||
these
|
||||
they
|
||||
this
|
||||
to
|
||||
was
|
||||
will
|
||||
with
|
|
@ -0,0 +1,29 @@
|
|||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
#some test synonym mappings unlikely to appear in real input text
|
||||
aaafoo => aaabar
|
||||
bbbfoo => bbbfoo bbbbar
|
||||
cccfoo => cccbar cccbaz
|
||||
fooaaa,baraaa,bazaaa
|
||||
|
||||
# Some synonym groups specific to this example
|
||||
GB,gib,gigabyte,gigabytes
|
||||
MB,mib,megabyte,megabytes
|
||||
Television, Televisions, TV, TVs
|
||||
#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
|
||||
#after us won't split it into two words.
|
||||
|
||||
# Synonym mappings can be used for spelling correction too
|
||||
pixima => pixma
|
||||
|
|
@ -0,0 +1,549 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
|
||||
<xsl:output omit-xml-declaration="yes" indent="yes"/>
|
||||
|
||||
<xsl:template match="//FIELDS">
|
||||
|
||||
<xsl:param name="textFieldType" select="string('text_common')"/>
|
||||
<xsl:variable name="smallcase" select="'abcdefghijklmnopqrstuvwxyz'"/>
|
||||
<xsl:variable name="uppercase" select="'ABCDEFGHIJKLMNOPQRSTUVWXYZ'"/>
|
||||
|
||||
<!--
|
||||
D-Net index schema template
|
||||
|
||||
CHANGELOG
|
||||
|
||||
0.1 : first release
|
||||
0.2 : added preserveOriginal="1" for text field type in the index analyzer and catenateWords="1" for the query analyzer
|
||||
0.3 : changed language for SnowballPorterFilterFactory to language="German2" (index/query) in the text field type
|
||||
0.4 : added solr.ASCIIFoldingFilterFactory filter (index/query) in the text field type
|
||||
0.5 : added long_keyword field type, to be used for objIdentifiers
|
||||
0.6 : added field types for spellchecking
|
||||
0.7 : added parameter for text field type
|
||||
0.8 : added field _version_, needed by Solr 4.0.0 for the transaction log
|
||||
0.9 : added type: text_en_splitting
|
||||
0.91 : added type: ngramtext
|
||||
0.92 : added schema optimizations, removing unnecessary stored fields
|
||||
0.93 : added attribute preserveOriginal="1" to fieldtype ngramtext (query analysis) to improve matches
|
||||
0.94 : updated and simplified ngramtext fieldtype
|
||||
0.95 : update to solr 4.4, removed attribute "compress" from field definition, ngramfield doesn't support NGramFilterFactory anymore
|
||||
0.96 : update to solr 4.9
|
||||
0.97 : introduced field type string_ci supporting case insensitivity.
|
||||
1.0 : updated to solr 6.6.0
|
||||
-->
|
||||
<schema name="dnet" version="1.0">
|
||||
|
||||
<!-- Valid attributes for fields:
|
||||
name: mandatory - the name for the field
|
||||
type: mandatory - the name of a field type from the
|
||||
fieldTypes section
|
||||
indexed: true if this field should be indexed (searchable or sortable)
|
||||
stored: true if this field should be retrievable
|
||||
docValues: true if this field should have doc values. Doc values are
|
||||
useful (required, if you are using *Point fields) for faceting,
|
||||
grouping, sorting and function queries. Doc values will make the index
|
||||
faster to load, more NRT-friendly and more memory-efficient.
|
||||
They however come with some limitations: they are currently only
|
||||
supported by StrField, UUIDField, all Trie*Fields and *PointFields,
|
||||
and depending on the field type, they might require the field to be
|
||||
single-valued, be required or have a default value (check the
|
||||
documentation of the field type you're interested in for more information)
|
||||
multiValued: true if this field may contain multiple values per document
|
||||
omitNorms: (expert) set to true to omit the norms associated with
|
||||
this field (this disables length normalization and index-time
|
||||
boosting for the field, and saves some memory). Only full-text
|
||||
fields or fields that need an index-time boost need norms.
|
||||
Norms are omitted for primitive (non-analyzed) types by default.
|
||||
termVectors: [false] set to true to store the term vector for a
|
||||
given field.
|
||||
When using MoreLikeThis, fields used for similarity should be
|
||||
stored for best performance.
|
||||
termPositions: Store position information with the term vector.
|
||||
This will increase storage costs.
|
||||
termOffsets: Store offset information with the term vector. This
|
||||
will increase storage costs.
|
||||
required: The field is required. It will throw an error if the
|
||||
value does not exist
|
||||
default: a value that should be used if no value is specified
|
||||
when adding a document.
|
||||
-->
|
||||
|
||||
<!-- field names should consist of alphanumeric or underscore characters only and
|
||||
not start with a digit. This is not currently strictly enforced,
|
||||
but other field names will not have first class support from all components
|
||||
and back compatibility is not guaranteed. Names with both leading and
|
||||
trailing underscores (e.g. _version_) are reserved.
|
||||
-->
|
||||
|
||||
<xsl:for-each select="./FIELD">
|
||||
<xsl:variable name="fieldname" select="translate(@name, $uppercase, $smallcase)"/>
|
||||
<xsl:variable name="fieldtype">
|
||||
<xsl:choose>
|
||||
<xsl:when test="@type"><xsl:value-of select="@type"/></xsl:when>
|
||||
<xsl:when test="@tokenizable='false'">string</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:value-of select="$textFieldType"/>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsl:variable>
|
||||
<xsl:variable name="isMultivalued">
|
||||
<xsl:choose>
|
||||
<xsl:when test="@multivalued='false'">false</xsl:when>
|
||||
<xsl:otherwise>true</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsl:variable>
|
||||
<xsl:variable name="isStored">
|
||||
<xsl:choose>
|
||||
<xsl:when test="@stored='true'">true</xsl:when>
|
||||
<xsl:otherwise>false</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsl:variable>
|
||||
|
||||
<field name="{$fieldname}" type="{$fieldtype}" indexed="{@indexable}" stored="{normalize-space($isStored)}" multiValued="{normalize-space($isMultivalued)}"/>
|
||||
</xsl:for-each>
|
||||
|
||||
<field name="__indexrecordidentifier" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
|
||||
|
||||
<field name="__deleted" type="boolean" indexed="true" stored="false" default="false" omitNorms="true" omitTermFreqAndPositions="true"/>
|
||||
|
||||
<field name="__dsid" type="string" indexed="true" stored="true" omitNorms="true" omitTermFreqAndPositions="true"/>
|
||||
|
||||
<field name="__dsversion" type="pdate" indexed="true" stored="true" omitNorms="true" omitTermFreqAndPositions="true"/>
|
||||
|
||||
<field name="__result" type="string" indexed="false" stored="true" multiValued="false" docValues="false"/>
|
||||
|
||||
<field name="__all" type="{$textFieldType}" indexed="true" stored="false" multiValued="true"/>
|
||||
|
||||
<field name="_version_" type="long" indexed="true" stored="true" multiValued="false" />
|
||||
|
||||
<field name="_root_" type="string" indexed="true" stored="false" docValues="false" />
|
||||
|
||||
<!-- field for ping -->
|
||||
<field name="text" type="{$textFieldType}" indexed="false" stored="false"/>
|
||||
|
||||
<!-- Field to use to determine and enforce document uniqueness.
|
||||
Unless this field is marked with required="false", it will be a required field
|
||||
-->
|
||||
<uniqueKey>__indexrecordidentifier</uniqueKey>
|
||||
|
||||
<xsl:for-each select="./FIELD[@copy = 'true']">
|
||||
<xsl:variable name="fieldname" select="translate(@name, $uppercase, $smallcase)"/>
|
||||
<copyField source="{$fieldname}" dest="__all"/>
|
||||
</xsl:for-each>
|
||||
|
||||
<!-- copyField commands copy one field to another at the time a document
|
||||
is added to the index. It's used either to index the same field differently,
|
||||
or to add multiple fields to the same field for easier/faster searching.
|
||||
|
||||
<copyField source="sourceFieldName" dest="destinationFieldName"/>
|
||||
-->
|
||||
|
||||
<!-- field type definitions. The "name" attribute is
|
||||
just a label to be used by field definitions. The "class"
|
||||
attribute and any other attributes determine the real
|
||||
behavior of the fieldType.
|
||||
Class names starting with "solr" refer to java classes in a
|
||||
standard package such as org.apache.solr.analysis
|
||||
-->
|
||||
|
||||
<!-- The StrField type is not analyzed, but indexed/stored verbatim.
|
||||
It supports doc values but in that case the field needs to be
|
||||
single-valued and either required or have a default value.
|
||||
-->
|
||||
<fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
|
||||
<fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true" docValues="true" />
|
||||
|
||||
<!-- boolean type: "true" or "false" -->
|
||||
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
|
||||
|
||||
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
|
||||
|
||||
<!-- sortMissingLast and sortMissingFirst attributes are optional attributes
|
||||
currently supported on types that are sorted internally as strings
|
||||
and on numeric types.
|
||||
This includes "string","boolean", "int", "float", "long", "date", "double",
|
||||
including the "Trie" and "Point" variants.
|
||||
- If sortMissingLast="true", then a sort on this field will cause documents
|
||||
without the field to come after documents with the field,
|
||||
regardless of the requested sort order (asc or desc).
|
||||
- If sortMissingFirst="true", then a sort on this field will cause documents
|
||||
without the field to come before documents with the field,
|
||||
regardless of the requested sort order.
|
||||
- If sortMissingLast="false" and sortMissingFirst="false" (the default),
|
||||
then default lucene sorting will be used which places docs without the
|
||||
field first in an ascending sort and last in a descending sort.
|
||||
-->
|
||||
|
||||
<!--
|
||||
Numeric field types that index values using KD-trees. *Point fields are faster and more efficient than Trie* fields both, at
|
||||
search time and at index time, but some features are still not supported.
|
||||
Point fields don't support FieldCache, so they must have docValues="true" if needed for sorting, faceting, functions, etc.
|
||||
-->
|
||||
<fieldType name="pint" class="solr.IntPointField" docValues="true"/>
|
||||
<fieldType name="pfloat" class="solr.FloatPointField" docValues="true"/>
|
||||
<fieldType name="plong" class="solr.LongPointField" docValues="true"/>
|
||||
<fieldType name="pdouble" class="solr.DoublePointField" docValues="true"/>
|
||||
|
||||
<fieldType name="pints" class="solr.IntPointField" docValues="true" multiValued="true"/>
|
||||
<fieldType name="pfloats" class="solr.FloatPointField" docValues="true" multiValued="true"/>
|
||||
<fieldType name="plongs" class="solr.LongPointField" docValues="true" multiValued="true"/>
|
||||
<fieldType name="pdoubles" class="solr.DoublePointField" docValues="true" multiValued="true"/>
|
||||
|
||||
<!--
|
||||
Default numeric field types. For faster range queries, consider *PointFields (pint/pfloat/plong/pdouble), or the
|
||||
tint/tfloat/tlong/tdouble types.
|
||||
-->
|
||||
<fieldType name="int" class="solr.TrieIntField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
||||
<fieldType name="float" class="solr.TrieFloatField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
||||
<fieldType name="long" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
||||
<fieldType name="double" class="solr.TrieDoubleField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
||||
|
||||
<fieldType name="ints" class="solr.TrieIntField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="floats" class="solr.TrieFloatField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="longs" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="doubles" class="solr.TrieDoubleField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
||||
|
||||
<!--
|
||||
Numeric field types that index each value at various levels of precision
|
||||
to accelerate range queries when the number of values between the range
|
||||
endpoints is large. See the javadoc for NumericRangeQuery for internal
|
||||
implementation details.
|
||||
|
||||
Smaller precisionStep values (specified in bits) will lead to more tokens
|
||||
indexed per value, slightly larger index size, and faster range queries.
|
||||
A precisionStep of 0 disables indexing at different precision levels.
|
||||
|
||||
Consider using pint/pfloat/plong/pdouble instead of Trie* fields if possible
|
||||
-->
|
||||
<fieldType name="tint" class="solr.TrieIntField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
||||
<fieldType name="tfloat" class="solr.TrieFloatField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
||||
<fieldType name="tlong" class="solr.TrieLongField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
||||
<fieldType name="tdouble" class="solr.TrieDoubleField" docValues="true" precisionStep="8" positionIncrementGap="0"/>
|
||||
|
||||
<fieldType name="tints" class="solr.TrieIntField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="tfloats" class="solr.TrieFloatField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="tlongs" class="solr.TrieLongField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
||||
<fieldType name="tdoubles" class="solr.TrieDoubleField" docValues="true" precisionStep="8" positionIncrementGap="0" multiValued="true"/>
|
||||
|
||||
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
|
||||
is a more restricted form of the canonical representation of dateTime
|
||||
http://www.w3.org/TR/xmlschema-2/#dateTime
|
||||
The trailing "Z" designates UTC time and is mandatory.
|
||||
Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
|
||||
All other components are mandatory.
|
||||
|
||||
Expressions can also be used to denote calculations that should be
|
||||
performed relative to "NOW" to determine the value, ie...
|
||||
|
||||
NOW/HOUR
|
||||
... Round to the start of the current hour
|
||||
NOW-1DAY
|
||||
... Exactly 1 day prior to now
|
||||
NOW/DAY+6MONTHS+3DAYS
|
||||
... 6 months and 3 days in the future from the start of
|
||||
the current day
|
||||
|
||||
Consult the TrieDateField javadocs for more information.
|
||||
-->
|
||||
<!-- KD-tree versions of date fields -->
|
||||
<fieldType name="pdate" class="solr.DatePointField" docValues="true"/>
|
||||
<fieldType name="pdates" class="solr.DatePointField" docValues="true" multiValued="true"/>
|
||||
|
||||
<fieldType name="date" class="solr.TrieDateField" docValues="true" precisionStep="0" positionIncrementGap="0"/>
|
||||
<fieldType name="dates" class="solr.TrieDateField" docValues="true" precisionStep="0" positionIncrementGap="0" multiValued="true"/>
|
||||
|
||||
<fieldType name="tdate" class="solr.TrieDateField" docValues="true" precisionStep="6" positionIncrementGap="0"/>
|
||||
<fieldType name="tdates" class="solr.TrieDateField" docValues="true" precisionStep="6" positionIncrementGap="0" multiValued="true"/>
|
||||
|
||||
|
||||
<!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
|
||||
<fieldType name="binary" class="solr.BinaryField"/>
|
||||
|
||||
<!-- The "RandomSortField" is not used to store or search any
|
||||
data. You can declare fields of this type it in your schema
|
||||
to generate pseudo-random orderings of your docs for sorting
|
||||
or function purposes. The ordering is generated based on the field
|
||||
name and the version of the index. As long as the index version
|
||||
remains unchanged, and the same field name is reused,
|
||||
the ordering of the docs will be consistent.
|
||||
If you want different pseudo-random orderings of documents,
|
||||
for the same version of the index, use a dynamicField and
|
||||
change the field name in the request.
|
||||
-->
|
||||
<fieldType name="random" class="solr.RandomSortField" indexed="true" />
|
||||
|
||||
<!-- solr.TextField allows the specification of custom text analyzers
|
||||
specified as a tokenizer and a list of token filters. Different
|
||||
analyzers may be specified for indexing and querying.
|
||||
|
||||
The optional positionIncrementGap puts space between multiple fields of
|
||||
this type on the same document, with the purpose of preventing false phrase
|
||||
matching across fields.
|
||||
|
||||
For more info on customizing your analyzer chain, please see
|
||||
http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
|
||||
-->
|
||||
|
||||
<!-- One can also specify an existing Analyzer class that has a
|
||||
default constructor via the class attribute on the analyzer element.
|
||||
Example:
|
||||
<fieldType name="text_greek" class="solr.TextField">
|
||||
<analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/>
|
||||
</fieldType>
|
||||
-->
|
||||
|
||||
<!-- A text field that only splits on whitespace for exact matching of words -->
|
||||
<!-- <dynamicField name="*_ws" type="text_ws" indexed="true" stored="true"/> -->
|
||||
|
||||
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="ngramtext" class="solr.TextField">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.NGramFilterFactory" minGramSize="3" maxGramSize="25"/>
|
||||
<filter class="solr.TrimFilterFactory"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<fieldType name="personName" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="personNamePrefix" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
<filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="30" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!-- A general text field that has reasonable, generic
|
||||
cross-language defaults: it tokenizes with StandardTokenizer,
|
||||
removes stop words from case-insensitive "stopwords.txt"
|
||||
(empty by default), and down cases. At query time only, it
|
||||
also applies synonyms.
|
||||
-->
|
||||
<fieldType name="text_common" class="solr.TextField" positionIncrementGap="100" multiValued="true">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<!-- in this example, we will only use synonyms at query time
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
<filter class="solr.FlattenGraphFilterFactory"/>
|
||||
-->
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPossessiveFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.PorterStemFilterFactory"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.SynonymGraphFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
|
||||
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.EnglishPossessiveFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.PorterStemFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- A text field with defaults appropriate for English, plus
|
||||
aggressive word-splitting and autophrase features enabled.
|
||||
This field is just like text_en, except it adds
|
||||
WordDelimiterGraphFilter to enable splitting and matching of
|
||||
words on case-change, alpha numeric boundaries, and
|
||||
non-alphanumeric chars. This means certain compound word
|
||||
cases will work, for example query "wi fi" will match
|
||||
document "WiFi" or "wi-fi".
|
||||
-->
|
||||
<!-- <dynamicField name="*_txt_en_split" type="text_en_splitting" indexed="true" stored="true"/> -->
|
||||
<fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<!-- in this example, we will only use synonyms at query time
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
-->
|
||||
<!-- Case insensitive stop word removal.
|
||||
-->
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="stopwords.txt"
|
||||
/>
|
||||
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.PorterStemFilterFactory"/>
|
||||
<filter class="solr.FlattenGraphFilterFactory" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||
<filter class="solr.StopFilterFactory"
|
||||
ignoreCase="true"
|
||||
words="stopwords.txt"
|
||||
/>
|
||||
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.PorterStemFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
|
||||
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
|
||||
<!-- <dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true"/> -->
|
||||
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.EnglishMinimalStemFilterFactory"/>
|
||||
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
|
||||
possible with WordDelimiterGraphFilter in conjunction with stemming. -->
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
<filter class="solr.FlattenGraphFilterFactory" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
|
||||
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
|
||||
<filter class="solr.EnglishMinimalStemFilterFactory"/>
|
||||
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
|
||||
possible with WordDelimiterGraphFilter in conjunction with stemming. -->
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- Just like text_common except it reverses the characters of
|
||||
each token, to enable more efficient leading wildcard queries.
|
||||
-->
|
||||
<!-- <dynamicField name="*_txt_rev" type="text_common_rev" indexed="true" stored="true"/> -->
|
||||
<fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
|
||||
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- <dynamicField name="*_phon_en" type="phonetic_en" indexed="true" stored="true"/> -->
|
||||
<fieldType name="phonetic_en" stored="false" indexed="true" class="solr.TextField" >
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldType name="string_ci" class="solr.TextField" sortMissingLast="true" omitNorms="true">
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!--
|
||||
Example of using PathHierarchyTokenizerFactory at index time, so
|
||||
queries for paths match documents at that path, or in descendent paths
|
||||
-->
|
||||
<!-- <dynamicField name="*_descendent_path" type="descendent_path" indexed="true" stored="true"/> -->
|
||||
<fieldType name="descendent_path" class="solr.TextField">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!--
|
||||
Example of using PathHierarchyTokenizerFactory at query time, so
|
||||
queries for paths match documents at that path, or in ancestor paths
|
||||
-->
|
||||
<!-- <dynamicField name="*_ancestor_path" type="ancestor_path" indexed="true" stored="true"/> -->
|
||||
<fieldType name="ancestor_path" class="solr.TextField">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- since fields of this type are by default not stored or indexed,
|
||||
any data added to them will be ignored outright. -->
|
||||
<fieldType name="ignored" stored="false" indexed="false" docValues="false" multiValued="true" class="solr.StrField" />
|
||||
|
||||
<!-- This point type indexes the coordinates as separate fields (subFields)
|
||||
If subFieldType is defined, it references a type, and a dynamic field
|
||||
definition is created matching *___<typename>. Alternately, if
|
||||
subFieldSuffix is defined, that is used to create the subFields.
|
||||
Example: if subFieldType="double", then the coordinates would be
|
||||
indexed in fields myloc_0___double,myloc_1___double.
|
||||
Example: if subFieldSuffix="_d" then the coordinates would be indexed
|
||||
in fields myloc_0_d,myloc_1_d
|
||||
The subFields are an implementation detail of the fieldType, and end
|
||||
users normally should not need to know about them.
|
||||
-->
|
||||
<!-- <dynamicField name="*_point" type="point" indexed="true" stored="true"/> -->
|
||||
<fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
|
||||
|
||||
<!-- A specialized field for geospatial search filters and distance sorting. -->
|
||||
<fieldType name="location" class="solr.LatLonPointSpatialField" docValues="true"/>
|
||||
|
||||
<!-- An alternative geospatial field type new to Solr 4. It supports multiValued and polygon shapes.
|
||||
For more information about this and other Spatial fields new to Solr 4, see:
|
||||
http://wiki.apache.org/solr/SolrAdaptersForLuceneSpatial4
|
||||
-->
|
||||
<fieldType name="location_rpt" class="solr.SpatialRecursivePrefixTreeFieldType"
|
||||
geo="true" distErrPct="0.025" maxDistErr="0.001" distanceUnits="kilometers" />
|
||||
|
||||
</schema>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
|
@ -0,0 +1,14 @@
|
|||
{
|
||||
"id":"solr",
|
||||
"address":"localhost:9983",
|
||||
"port":"8983",
|
||||
"webContext":"solr",
|
||||
"numShards":"4",
|
||||
"replicationFactor":"1",
|
||||
"maxShardsPerNode":"4",
|
||||
"host":"localhost",
|
||||
"luceneMatchVersion":"7.5.0",
|
||||
"feedingShutdownTolerance":"30000",
|
||||
"feedingBufferFlushThreshold":"1000",
|
||||
"feedingSimulationMode":"false"
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
<FIELDS><!-- SOURCE FIELD -->
|
||||
<FIELD indexable="true" name="source_identifier" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_type" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="false" name="source_publication_date" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_subType" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_pid" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_schema" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_publisher" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="source_collected_from" tokenizable="true" stored="true" stat="false" xpath="None"/><!-- TARGET FIELD -->
|
||||
<FIELD indexable="true" name="target_identifier" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_type" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_subType" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_pid" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_schema" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_publisher" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="target_collected_from" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="false" name="target_publication_date" stored="true" stat="false" tokenizable="false" value="None"/><!-- RELATION FIELD -->
|
||||
<FIELD indexable="true" name="publicationDate" multivalued="false" stored="true" stat="false" type="pdate" value="None"/>
|
||||
<FIELD indexable="true" name="relation_name" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="relation_inverse" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="publisher_name" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="linkprovider" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
</FIELDS>
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,32 @@
|
|||
[
|
||||
{
|
||||
"paramName":"c",
|
||||
"paramLongName":"cluster",
|
||||
"paramDescription":"should be cluster1 or cluster2",
|
||||
"paramRequired":true
|
||||
},
|
||||
{
|
||||
"paramName":"is",
|
||||
"paramLongName":"isURL",
|
||||
"paramDescription":"the Information Service LookUp URL",
|
||||
"paramRequired":true
|
||||
},
|
||||
{
|
||||
"paramName":"ip",
|
||||
"paramLongName":"inputPath",
|
||||
"paramDescription":"the source input path",
|
||||
"paramRequired":true
|
||||
},
|
||||
{
|
||||
"paramName":"b",
|
||||
"paramLongName":"batchSize",
|
||||
"paramDescription":"the batch size param",
|
||||
"paramRequired":false
|
||||
},
|
||||
{
|
||||
"paramName":"f",
|
||||
"paramLongName":"format",
|
||||
"paramDescription":"index metadata format name",
|
||||
"paramRequired":true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,14 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,113 @@
|
|||
<workflow-app name="Index Scholexplorer Infospace" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the sourcePath of the json RDDs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>URL for the isLookup service</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>solrDeletionQuery</name>
|
||||
<value>*:*</value>
|
||||
<description>query used in the deleted by query operation</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>format</name>
|
||||
<description>metadata format name (SMF)</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="indexScholix"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="drop_solr_collection">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--action</arg><arg>DELETE_BY_QUERY</arg>
|
||||
<arg>--query</arg><arg>${solrDeletionQuery}</arg>
|
||||
<arg>--commit</arg><arg>true</arg>
|
||||
</java>
|
||||
<ok to="create_solr_index"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="create_solr_index">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--action</arg><arg>CREATE</arg>
|
||||
|
||||
</java>
|
||||
<ok to="indexScholix"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="indexScholix">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Index summary</name>
|
||||
<class>eu.dnetlib.dhp.sx.provision.SparkIndexCollectionOnSOLR</class>
|
||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--conf spark.dynamicAllocation.enabled=true
|
||||
--conf spark.shuffle.service.enabled=true
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors="16"
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--cluster</arg><arg>yarn</arg>
|
||||
<arg>--isURL</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--inputPath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
|
||||
</spark>
|
||||
<ok to="commit_solr_collection"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="commit_solr_collection">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,185 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.params.MapSolrParams;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
import eu.dnetlib.dhp.oa.provision.scholix.ScholixToSolr;
|
||||
|
||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||
public class ScholixIndexingTest extends SolrTest {
|
||||
|
||||
private static String LAYOUT_PATH = "/eu/dnetlib/dhp/oa/provision/SMF_layout.xml";
|
||||
|
||||
/**
|
||||
* This test verifies that the schema will be generated correctly
|
||||
* by get the profile of the metadataFormat and generating solr schema.xml
|
||||
* we expect that the fiedl in the metadataFormat are all in the field solr schema
|
||||
* @throws Exception
|
||||
*/
|
||||
@Test
|
||||
@Order(1)
|
||||
void testSchemaCreation() throws Exception {
|
||||
|
||||
final String layout = loadSMFLayout();
|
||||
assertNotNull(layout);
|
||||
assertTrue(StringUtils.isNotBlank(layout));
|
||||
|
||||
final String scheme = SolrUtil.getSchemaXML(loadSMFLayout());
|
||||
assertNotNull(scheme);
|
||||
assertTrue(StringUtils.isNotBlank(scheme));
|
||||
|
||||
final Document fields = parseDocument(layout);
|
||||
List<Node> params = fields.selectNodes("//FIELD");
|
||||
final List<String> exptectedFieldName = new ArrayList<>();
|
||||
for (Node param : params) {
|
||||
Element element = (Element) param;
|
||||
String name = element.attributeValue("name");
|
||||
exptectedFieldName.add(name.toLowerCase());
|
||||
}
|
||||
assertTrue(exptectedFieldName.size() > 0);
|
||||
|
||||
final Document parsedScheme = parseDocument(scheme);
|
||||
params = parsedScheme.selectNodes("//field");
|
||||
final List<String> createdFieldName = new ArrayList<>();
|
||||
for (Node param : params) {
|
||||
|
||||
Element element = (Element) param;
|
||||
String name = element.attributeValue("name");
|
||||
createdFieldName.add(name.toLowerCase());
|
||||
}
|
||||
assertTrue(createdFieldName.size() > 0);
|
||||
|
||||
exptectedFieldName.stream().map(createdFieldName::contains).forEach(Assertions::assertTrue);
|
||||
}
|
||||
|
||||
/***
|
||||
* Test the creation of the index works
|
||||
* we test if all the files are uploaded into
|
||||
* the zookeeper instance of SOLR under it's
|
||||
* collection name
|
||||
* @throws Exception
|
||||
*/
|
||||
@Test
|
||||
@Order(2)
|
||||
public void testCreateCollection() throws Exception {
|
||||
final String collectionName = "SMF-index-scholix";
|
||||
SolrUtil.uploadZookeperConfig(miniCluster.getZkClient(), collectionName, true, loadSMFLayout());
|
||||
|
||||
assertTrue(miniCluster.getZkClient().exists("/configs/" + collectionName, true));
|
||||
List<String> items = miniCluster.getZkClient().getChildren("/configs/" + collectionName, null, true);
|
||||
|
||||
List<String> configurationFiles = Files
|
||||
.list(
|
||||
Paths
|
||||
.get(
|
||||
Objects.requireNonNull(getClass().getResource(SolrUtil.CONF_BASE_PATH + "files/")).getPath()))
|
||||
.map(Path::getFileName)
|
||||
.filter(p -> !p.getFileName().toString().equalsIgnoreCase("file_list"))
|
||||
.map(Path::toString)
|
||||
.collect(Collectors.toList());
|
||||
configurationFiles.add("schema.xml");
|
||||
configurationFiles.add("solrconfig.xml");
|
||||
configurationFiles.forEach(s -> assertTrue(items.contains(s)));
|
||||
|
||||
SolrUtil.createCollection(miniCluster.getSolrClient(), "Scholix", 4, 1, 2, collectionName);
|
||||
|
||||
log.debug("Collection Created");
|
||||
final Map<String, String> queryParamMap = new HashMap<>();
|
||||
queryParamMap.put("q", "*:*");
|
||||
|
||||
MapSolrParams queryParams = new MapSolrParams(queryParamMap);
|
||||
final QueryResponse response = miniCluster.getSolrClient().query("Scholix", queryParams);
|
||||
final SolrDocumentList documents = response.getResults();
|
||||
assertEquals(0, documents.getNumFound());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(3)
|
||||
public void testFeedingSolrDocument() throws Exception {
|
||||
|
||||
InputStream gzipStream = new GZIPInputStream(
|
||||
Objects.requireNonNull(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/provision/scholix_records.gz")));
|
||||
Reader decoder = new InputStreamReader(gzipStream, StandardCharsets.UTF_8);
|
||||
BufferedReader buffered = new BufferedReader(decoder);
|
||||
String line = buffered.readLine();
|
||||
|
||||
final CloudSolrClient client = miniCluster.getSolrClient();
|
||||
client.setDefaultCollection("Scholix");
|
||||
int added = 0;
|
||||
while (line != null) {
|
||||
|
||||
final SolrInputDocument solrDocument = ScholixToSolr.toSolrDocument(line);
|
||||
|
||||
client.add(solrDocument);
|
||||
added++;
|
||||
line = buffered.readLine();
|
||||
}
|
||||
|
||||
client.commit();
|
||||
|
||||
log.debug(String.format("Feed %d documents", added));
|
||||
|
||||
final SolrDocumentList documents = executeQuery("*:*");
|
||||
assertEquals(added, documents.getNumFound());
|
||||
|
||||
documents.stream().map(s -> s.getFirstValue("source_pid").toString()).forEach(System.out::println);
|
||||
|
||||
SolrDocumentList source_pids = executeQuery("source_pid:\"10.15468/dl.u47azs\"");
|
||||
|
||||
System.out.println("source_pid.getNumFound() = " + source_pids.getNumFound());
|
||||
|
||||
source_pids.stream().map(s -> s.getFieldValue("source_pid")).forEach(System.out::println);
|
||||
|
||||
}
|
||||
|
||||
private SolrDocumentList executeQuery(final String query) throws SolrServerException, IOException {
|
||||
|
||||
final Map<String, String> queryParamMap = new HashMap<>();
|
||||
queryParamMap.put("q", query);
|
||||
|
||||
MapSolrParams queryParams = new MapSolrParams(queryParamMap);
|
||||
final QueryResponse response = miniCluster.getSolrClient().query("Scholix", queryParams);
|
||||
return response.getResults();
|
||||
}
|
||||
|
||||
/***
|
||||
* Utility for parsing XML
|
||||
* @param xml
|
||||
* @return Dom4J Document
|
||||
* @throws DocumentException
|
||||
*/
|
||||
private Document parseDocument(final String xml) throws DocumentException {
|
||||
return new SAXReader().read(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
|
||||
}
|
||||
|
||||
private String loadSMFLayout() throws IOException {
|
||||
return IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream(LAYOUT_PATH)));
|
||||
}
|
||||
|
||||
}
|
|
@ -2,11 +2,9 @@
|
|||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import org.apache.solr.client.solrj.response.SolrPingResponse;
|
||||
import org.apache.solr.client.solrj.response.UpdateResponse;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class SolrAdminApplicationTest extends SolrTest {
|
||||
|
@ -24,7 +22,7 @@ class SolrAdminApplicationTest extends SolrTest {
|
|||
SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost());
|
||||
|
||||
UpdateResponse rsp = (UpdateResponse) admin
|
||||
.execute(SolrAdminApplication.Action.DELETE_BY_QUERY, DEFAULT_COLLECTION, "*:*", false);
|
||||
.execute(SolrAdminApplication.Action.DELETE_BY_QUERY, DEFAULT_COLLECTION, "*:*", false, null);
|
||||
|
||||
assertEquals(0, rsp.getStatus());
|
||||
}
|
||||
|
@ -38,5 +36,4 @@ class SolrAdminApplicationTest extends SolrTest {
|
|||
|
||||
assertEquals(0, rsp.getStatus());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
<LAYOUT name="index">
|
||||
<FIELDS>
|
||||
|
||||
<!-- SOURCE FIELD -->
|
||||
<FIELD indexable="true" name="source_identifier" multivalued="false" stored="false" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_type" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="false" name="source_publication_date" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_subtype" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_pid" multivalued="true" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_schema" multivalued="true" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="source_publisher" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="source_collected_from" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
|
||||
<!-- TARGET FIELD -->
|
||||
<FIELD indexable="true" name="target_identifier" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_type" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="false" name="target_publication_date" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_subtype" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_pid" multivalued="true" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_schema" multivalued="true" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="target_publisher" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="target_collected_from" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
|
||||
<!-- RELATION FIELD -->
|
||||
<FIELD indexable="true" name="publication_date" multivalued="false" stored="true" stat="false" type="date" value="None"/>
|
||||
<FIELD indexable="true" name="relation_name" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="relation_inverse" multivalued="false" stored="true" stat="false" tokenizable="false" value="None"/>
|
||||
<FIELD indexable="true" name="publisher_name" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
<FIELD indexable="true" name="link_provider" multivalued="true" tokenizable="true" stored="true" stat="false" xpath="None"/>
|
||||
</FIELDS>
|
||||
</LAYOUT>
|
Binary file not shown.
|
@ -352,7 +352,9 @@
|
|||
</goals>
|
||||
<configuration>
|
||||
<tasks>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<property name="assembly-resources.loc" value="${maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path}" />
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<unjar src="${assembly-resources.loc}" dest="${project.build.directory}/assembly-resources" />
|
||||
</tasks>
|
||||
</configuration>
|
||||
|
@ -427,9 +429,12 @@
|
|||
<configuration>
|
||||
<executable>ssh</executable>
|
||||
<arguments>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
|
||||
<argument>-o StrictHostKeyChecking=no</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>rm -rf ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; mkdir -p ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/</argument>
|
||||
</arguments>
|
||||
</configuration>
|
||||
|
@ -443,9 +448,11 @@
|
|||
<configuration>
|
||||
<executable>scp</executable>
|
||||
<arguments>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>-P ${dhp.hadoop.frontend.port.ssh}</argument>
|
||||
<argument>-o StrictHostKeyChecking=no</argument>
|
||||
<argument>target/${oozie.package.file.name}.tar.gz</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}:${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/${oozie.package.file.name}.tar.gz</argument>
|
||||
</arguments>
|
||||
</configuration>
|
||||
|
@ -460,11 +467,15 @@
|
|||
<executable>ssh</executable>
|
||||
<!-- <outputFile>target/redirected_upload.log</outputFile> -->
|
||||
<arguments>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
|
||||
<argument>-o StrictHostKeyChecking=no</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; </argument>
|
||||
<argument>tar -zxf oozie-package.tar.gz; </argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>rm ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/oozie-package.tar.gz; </argument>
|
||||
<argument>./upload_workflow.sh</argument>
|
||||
</arguments>
|
||||
|
@ -495,9 +506,12 @@
|
|||
<!-- this file will be used by test verification profile reading job identifier -->
|
||||
<outputFile>${oozie.execution.log.file.location}</outputFile>
|
||||
<arguments>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>${dhp.hadoop.frontend.user.name}@${dhp.hadoop.frontend.host.name}</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>-p ${dhp.hadoop.frontend.port.ssh}</argument>
|
||||
<argument>-o StrictHostKeyChecking=no</argument>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>cd ${dhp.hadoop.frontend.temp.dir}/oozie-packages/${sandboxName}/${output.dir.name}/; </argument>
|
||||
<argument>./run_workflow.sh</argument>
|
||||
</arguments>
|
||||
|
@ -512,6 +526,7 @@
|
|||
<configuration>
|
||||
<executable>cat</executable>
|
||||
<arguments>
|
||||
<!--suppress UnresolvedMavenProperty -->
|
||||
<argument>${oozie.execution.log.file.location}</argument>
|
||||
</arguments>
|
||||
</configuration>
|
||||
|
|
Loading…
Reference in New Issue