merged beta
This commit is contained in:
commit
103e2652b3
|
@ -874,9 +874,11 @@ public class MergeUtils {
|
|||
if (toEnrichInstances == null) {
|
||||
return enrichmentResult;
|
||||
}
|
||||
if (enrichmentInstances == null) {
|
||||
return enrichmentResult;
|
||||
|
||||
if (enrichmentInstances == null || enrichmentInstances.isEmpty()) {
|
||||
return toEnrichInstances;
|
||||
}
|
||||
|
||||
Map<String, Instance> ri = toInstanceMap(enrichmentInstances);
|
||||
|
||||
toEnrichInstances.forEach(i -> {
|
||||
|
|
|
@ -61,56 +61,56 @@
|
|||
<artifactId>dhp-graph-mapper</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-graph-provision</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-impact-indicators</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-actionsets</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-hist-snaps</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-monitor-irish</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-promote</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-stats-update</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-swh</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-usage-raw-data-update</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>eu.dnetlib.dhp</groupId>-->
|
||||
<!-- <artifactId>dhp-usage-stats-build</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- </dependency>-->
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-graph-provision</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-impact-indicators</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-actionsets</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-hist-snaps</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-monitor-irish</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-promote</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-stats-update</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-swh</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-usage-raw-data-update</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-usage-stats-build</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
|
|
|
@ -80,9 +80,11 @@ public class PrepareFOSSparkJob implements Serializable {
|
|||
|
||||
fosDataset
|
||||
.groupByKey((MapFunction<FOSDataModel, String>) v -> v.getOaid().toLowerCase(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, FOSDataModel, Result>) (k, it) -> {
|
||||
return getResult(ModelSupport.getIdPrefix(Result.class) + "|" + k, it);
|
||||
}, Encoders.bean(Result.class))
|
||||
.mapGroups(
|
||||
(MapGroupsFunction<String, FOSDataModel, Result>) (k,
|
||||
it) -> getResult(
|
||||
ModelSupport.entityIdPrefix.get(Result.class.getSimpleName().toLowerCase()) + "|" + k, it),
|
||||
Encoders.bean(Result.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
|
|
|
@ -18,7 +18,11 @@ import javax.xml.transform.TransformerConfigurationException;
|
|||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import javax.xml.xpath.*;
|
||||
import javax.xml.xpath.XPath;
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import javax.xml.xpath.XPathExpression;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
import javax.xml.xpath.XPathFactory;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -35,7 +39,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
|
|||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
|
||||
/**
|
||||
* log.info(...) equal to log.trace(...) in the application-logs
|
||||
* log.info(...) equal to log.trace(...) in the application-logs
|
||||
* <p>
|
||||
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
|
||||
*
|
||||
|
@ -47,6 +51,7 @@ public class RestIterator implements Iterator<String> {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
|
||||
public static final String UTF_8 = "UTF-8";
|
||||
private static final int MAX_ATTEMPTS = 5;
|
||||
|
||||
private final HttpClientParams clientParams;
|
||||
|
||||
|
@ -60,8 +65,9 @@ public class RestIterator implements Iterator<String> {
|
|||
private final int resultSizeValue;
|
||||
private int resumptionInt = 0; // integer resumption token (first record to harvest)
|
||||
private int resultTotal = -1;
|
||||
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
|
||||
// or token scanned from results)
|
||||
private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
|
||||
// harvest
|
||||
// or token scanned from results)
|
||||
private InputStream resultStream;
|
||||
private Transformer transformer;
|
||||
private XPath xpath;
|
||||
|
@ -73,7 +79,7 @@ public class RestIterator implements Iterator<String> {
|
|||
private final String querySize;
|
||||
private final String authMethod;
|
||||
private final String authToken;
|
||||
private final Queue<String> recordQueue = new PriorityBlockingQueue<String>();
|
||||
private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
|
||||
private int discoverResultSize = 0;
|
||||
private int pagination = 1;
|
||||
/*
|
||||
|
@ -83,8 +89,8 @@ public class RestIterator implements Iterator<String> {
|
|||
*/
|
||||
private final String resultOutputFormat;
|
||||
|
||||
/** RestIterator class
|
||||
* compatible to version 1.3.33
|
||||
/**
|
||||
* RestIterator class compatible to version 1.3.33
|
||||
*/
|
||||
public RestIterator(
|
||||
final HttpClientParams clientParams,
|
||||
|
@ -108,40 +114,42 @@ public class RestIterator implements Iterator<String> {
|
|||
this.resumptionType = resumptionType;
|
||||
this.resumptionParam = resumptionParam;
|
||||
this.resultFormatValue = resultFormatValue;
|
||||
this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
|
||||
this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
|
||||
this.queryParams = queryParams;
|
||||
this.authMethod = authMethod;
|
||||
this.authToken = authToken;
|
||||
this.resultOutputFormat = resultOutputFormat;
|
||||
|
||||
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
|
||||
this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
|
||||
: "";
|
||||
this.querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr
|
||||
: "";
|
||||
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
|
||||
|
||||
try {
|
||||
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
|
||||
} catch (Exception e) {
|
||||
} catch (final Exception e) {
|
||||
throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
|
||||
}
|
||||
|
||||
initQueue();
|
||||
}
|
||||
|
||||
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath)
|
||||
private void initXmlTransformation(final String resultTotalXpath, final String resumptionXpath,
|
||||
final String entityXpath)
|
||||
throws TransformerConfigurationException, XPathExpressionException {
|
||||
final TransformerFactory factory = TransformerFactory.newInstance();
|
||||
transformer = factory.newTransformer();
|
||||
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
|
||||
xpath = XPathFactory.newInstance().newXPath();
|
||||
xprResultTotalPath = xpath.compile(resultTotalXpath);
|
||||
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
|
||||
xprEntity = xpath.compile(entityXpath);
|
||||
this.transformer = factory.newTransformer();
|
||||
this.transformer.setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
|
||||
this.xpath = XPathFactory.newInstance().newXPath();
|
||||
this.xprResultTotalPath = this.xpath.compile(resultTotalXpath);
|
||||
this.xprResumptionPath = this.xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
|
||||
this.xprEntity = this.xpath.compile(entityXpath);
|
||||
}
|
||||
|
||||
private void initQueue() {
|
||||
query = baseUrl + "?" + queryParams + querySize + queryFormat;
|
||||
log.info("REST calls starting with {}", query);
|
||||
this.query = this.baseUrl + "?" + this.queryParams + this.querySize + this.queryFormat;
|
||||
log.info("REST calls starting with {}", this.query);
|
||||
}
|
||||
|
||||
private void disconnect() {
|
||||
|
@ -154,12 +162,11 @@ public class RestIterator implements Iterator<String> {
|
|||
*/
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (recordQueue.isEmpty() && query.isEmpty()) {
|
||||
if (this.recordQueue.isEmpty() && this.query.isEmpty()) {
|
||||
disconnect();
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -168,214 +175,241 @@ public class RestIterator implements Iterator<String> {
|
|||
*/
|
||||
@Override
|
||||
public String next() {
|
||||
synchronized (recordQueue) {
|
||||
while (recordQueue.isEmpty() && !query.isEmpty()) {
|
||||
synchronized (this.recordQueue) {
|
||||
while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
|
||||
try {
|
||||
query = downloadPage(query);
|
||||
} catch (CollectorException e) {
|
||||
this.query = downloadPage(this.query, 0);
|
||||
} catch (final CollectorException e) {
|
||||
log.debug("CollectorPlugin.next()-Exception: {}", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
return recordQueue.poll();
|
||||
return this.recordQueue.poll();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* download page and return nextQuery
|
||||
* download page and return nextQuery (with number of attempt)
|
||||
*/
|
||||
private String downloadPage(String query) throws CollectorException {
|
||||
String resultJson;
|
||||
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||
String nextQuery = "";
|
||||
String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
|
||||
Node resultNode = null;
|
||||
NodeList nodeList = null;
|
||||
String qUrlArgument = "";
|
||||
int urlOldResumptionSize = 0;
|
||||
InputStream theHttpInputStream;
|
||||
private String downloadPage(String query, final int attempt) throws CollectorException {
|
||||
|
||||
// check if cursor=* is initial set otherwise add it to the queryParam URL
|
||||
if (resumptionType.equalsIgnoreCase("deep-cursor")) {
|
||||
log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
|
||||
if (!query.contains("&cursor=")) {
|
||||
query += "&cursor=*";
|
||||
if (attempt > MAX_ATTEMPTS) {
|
||||
throw new CollectorException("Max Number of attempts reached, query:" + query);
|
||||
}
|
||||
|
||||
if (attempt > 0) {
|
||||
final int delay = (attempt * 5000);
|
||||
log.debug("Attempt {} with delay {}", attempt, delay);
|
||||
try {
|
||||
Thread.sleep(delay);
|
||||
} catch (final InterruptedException e) {
|
||||
new CollectorException(e);
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
log.info("requestig URL [{}]", query);
|
||||
String resultJson;
|
||||
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
|
||||
String nextQuery = "";
|
||||
final String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
|
||||
Node resultNode = null;
|
||||
NodeList nodeList = null;
|
||||
String qUrlArgument = "";
|
||||
int urlOldResumptionSize = 0;
|
||||
InputStream theHttpInputStream;
|
||||
|
||||
URL qUrl = new URL(query);
|
||||
log.debug("authMethod: {}", authMethod);
|
||||
if ("bearer".equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: {}", resultXml);
|
||||
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: {}", resultXml);
|
||||
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
|
||||
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else {
|
||||
theHttpInputStream = qUrl.openStream();
|
||||
}
|
||||
|
||||
resultStream = theHttpInputStream;
|
||||
if ("json".equals(resultOutputFormat)) {
|
||||
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
|
||||
resultXml = JsonUtils.convertToXML(resultJson);
|
||||
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
||||
}
|
||||
|
||||
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
|
||||
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
|
||||
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
|
||||
log.debug("nodeList.length: {}", nodeList.getLength());
|
||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||
StringWriter sw = new StringWriter();
|
||||
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
|
||||
String toEnqueue = sw.toString();
|
||||
if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
|
||||
log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
|
||||
} else {
|
||||
recordQueue.add(sw.toString());
|
||||
}
|
||||
// check if cursor=* is initial set otherwise add it to the queryParam URL
|
||||
if ("deep-cursor".equalsIgnoreCase(this.resumptionType)) {
|
||||
log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
|
||||
if (!query.contains("&cursor=")) {
|
||||
query += "&cursor=*";
|
||||
}
|
||||
} else {
|
||||
log.warn("resultXml is equal with emptyXml");
|
||||
}
|
||||
|
||||
resumptionInt += resultSizeValue;
|
||||
try {
|
||||
log.info("requesting URL [{}]", query);
|
||||
|
||||
switch (resumptionType.toLowerCase()) {
|
||||
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
|
||||
resumptionStr = xprResumptionPath.evaluate(resultNode);
|
||||
break;
|
||||
final URL qUrl = new URL(query);
|
||||
log.debug("authMethod: {}", this.authMethod);
|
||||
if ("bearer".equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: {}", resultXml);
|
||||
final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken);
|
||||
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else if (this.BASIC.equalsIgnoreCase(this.authMethod)) {
|
||||
log.trace("authMethod before inputStream: {}", resultXml);
|
||||
final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
|
||||
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken);
|
||||
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
|
||||
conn.setRequestMethod("GET");
|
||||
theHttpInputStream = conn.getInputStream();
|
||||
} else {
|
||||
theHttpInputStream = qUrl.openStream();
|
||||
}
|
||||
|
||||
case "count": // begin at one step for all records, iterate over items
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
break;
|
||||
this.resultStream = theHttpInputStream;
|
||||
if ("json".equals(this.resultOutputFormat)) {
|
||||
resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
|
||||
resultXml = JsonUtils.convertToXML(resultJson);
|
||||
this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
|
||||
}
|
||||
|
||||
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
||||
if (resultSizeValue < 2) {
|
||||
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
|
||||
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
|
||||
resultNode = (Node) this.xpath
|
||||
.evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
|
||||
nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
|
||||
log.debug("nodeList.length: {}", nodeList.getLength());
|
||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
|
||||
final String toEnqueue = sw.toString();
|
||||
if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue)
|
||||
|| emptyXml.equalsIgnoreCase(toEnqueue)) {
|
||||
log
|
||||
.warn(
|
||||
"The following record resulted in empty item for the feeding queue: {}", resultXml);
|
||||
} else {
|
||||
this.recordQueue.add(sw.toString());
|
||||
}
|
||||
}
|
||||
qUrlArgument = qUrl.getQuery();
|
||||
String[] arrayQUrlArgument = qUrlArgument.split("&");
|
||||
for (String arrayUrlArgStr : arrayQUrlArgument) {
|
||||
if (arrayUrlArgStr.startsWith(resumptionParam)) {
|
||||
String[] resumptionKeyValue = arrayUrlArgStr.split("=");
|
||||
if (isInteger(resumptionKeyValue[1])) {
|
||||
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
|
||||
log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
|
||||
} else {
|
||||
log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
|
||||
} else {
|
||||
log.warn("resultXml is equal with emptyXml");
|
||||
}
|
||||
|
||||
this.resumptionInt += this.resultSizeValue;
|
||||
|
||||
switch (this.resumptionType.toLowerCase()) {
|
||||
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
|
||||
this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
|
||||
break;
|
||||
|
||||
case "count": // begin at one step for all records, iterate over items
|
||||
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||
break;
|
||||
|
||||
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
|
||||
if (this.resultSizeValue < 2) {
|
||||
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
|
||||
}
|
||||
qUrlArgument = qUrl.getQuery();
|
||||
final String[] arrayQUrlArgument = qUrlArgument.split("&");
|
||||
for (final String arrayUrlArgStr : arrayQUrlArgument) {
|
||||
if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
|
||||
final String[] resumptionKeyValue = arrayUrlArgStr.split("=");
|
||||
if (isInteger(resumptionKeyValue[1])) {
|
||||
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
|
||||
log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
|
||||
} else {
|
||||
log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (((emptyXml).equalsIgnoreCase(resultXml))
|
||||
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
|
||||
// resumptionStr = "";
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
if (((emptyXml).equalsIgnoreCase(resultXml))
|
||||
|| ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
|
||||
// resumptionStr = "";
|
||||
if (nodeList != null) {
|
||||
this.discoverResultSize += nodeList.getLength();
|
||||
}
|
||||
this.resultTotal = this.discoverResultSize;
|
||||
} else {
|
||||
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||
this.resultTotal = this.resumptionInt + 1;
|
||||
if (nodeList != null) {
|
||||
this.discoverResultSize += nodeList.getLength();
|
||||
}
|
||||
}
|
||||
resultTotal = discoverResultSize;
|
||||
} else {
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
resultTotal = resumptionInt + 1;
|
||||
log.info("discoverResultSize: {}", this.discoverResultSize);
|
||||
break;
|
||||
|
||||
case "pagination":
|
||||
case "page": // pagination, iterate over page numbers
|
||||
this.pagination += 1;
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
this.discoverResultSize += nodeList.getLength();
|
||||
} else {
|
||||
this.resultTotal = this.discoverResultSize;
|
||||
this.pagination = this.discoverResultSize;
|
||||
}
|
||||
}
|
||||
log.info("discoverResultSize: {}", discoverResultSize);
|
||||
break;
|
||||
this.resumptionInt = this.pagination;
|
||||
this.resumptionStr = Integer.toString(this.resumptionInt);
|
||||
break;
|
||||
|
||||
case "pagination":
|
||||
case "page": // pagination, iterate over page numbers
|
||||
pagination += 1;
|
||||
if (nodeList != null) {
|
||||
discoverResultSize += nodeList.getLength();
|
||||
} else {
|
||||
resultTotal = discoverResultSize;
|
||||
pagination = discoverResultSize;
|
||||
}
|
||||
resumptionInt = pagination;
|
||||
resumptionStr = Integer.toString(resumptionInt);
|
||||
break;
|
||||
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor
|
||||
// in
|
||||
// solr)
|
||||
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
|
||||
// deep-cursor, Param 'resultSizeValue' is less than 2");}
|
||||
|
||||
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
|
||||
// solr)
|
||||
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
|
||||
// deep-cursor, Param 'resultSizeValue' is less than 2");}
|
||||
this.resumptionStr = encodeValue(this.xprResumptionPath.evaluate(resultNode));
|
||||
this.queryParams = this.queryParams.replace("&cursor=*", "");
|
||||
|
||||
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
|
||||
queryParams = queryParams.replace("&cursor=*", "");
|
||||
// terminating if length of nodeList is 0
|
||||
if ((nodeList != null) && (nodeList.getLength() < this.discoverResultSize)) {
|
||||
this.resumptionInt += ((nodeList.getLength() + 1) - this.resultSizeValue);
|
||||
} else {
|
||||
this.resumptionInt += (nodeList.getLength() - this.resultSizeValue); // subtract the
|
||||
// resultSizeValue
|
||||
// because the iteration is over
|
||||
// real length and the
|
||||
// resultSizeValue is added before
|
||||
// the switch()
|
||||
}
|
||||
|
||||
// terminating if length of nodeList is 0
|
||||
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
|
||||
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
|
||||
} else {
|
||||
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
|
||||
// because the iteration is over
|
||||
// real length and the
|
||||
// resultSizeValue is added before
|
||||
// the switch()
|
||||
}
|
||||
this.discoverResultSize = nodeList.getLength();
|
||||
|
||||
discoverResultSize = nodeList.getLength();
|
||||
log
|
||||
.debug(
|
||||
"downloadPage().deep-cursor: resumptionStr=" + this.resumptionStr + " ; queryParams="
|
||||
+ this.queryParams + " resumptionLengthIncreased: " + this.resumptionInt);
|
||||
|
||||
log
|
||||
.debug(
|
||||
"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
|
||||
+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
|
||||
break;
|
||||
|
||||
break;
|
||||
default: // otherwise: abort
|
||||
// resultTotal = resumptionInt;
|
||||
break;
|
||||
}
|
||||
|
||||
default: // otherwise: abort
|
||||
// resultTotal = resumptionInt;
|
||||
break;
|
||||
} catch (final Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("collection failed: " + e.getMessage());
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("collection failed: " + e.getMessage());
|
||||
}
|
||||
|
||||
try {
|
||||
if (resultTotal == -1) {
|
||||
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
|
||||
if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
|
||||
resultTotal += 1;
|
||||
} // to correct the upper bound
|
||||
log.info("resultTotal was -1 is now: " + resultTotal);
|
||||
try {
|
||||
if (this.resultTotal == -1) {
|
||||
this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
|
||||
if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) {
|
||||
this.resultTotal += 1;
|
||||
} // to correct the upper bound
|
||||
log.info("resultTotal was -1 is now: " + this.resultTotal);
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
|
||||
log.debug("resultTotal: " + this.resultTotal);
|
||||
log.debug("resInt: " + this.resumptionInt);
|
||||
if (this.resumptionInt <= this.resultTotal) {
|
||||
nextQuery = this.baseUrl + "?" + this.queryParams + this.querySize + "&" + this.resumptionParam + "="
|
||||
+ this.resumptionStr
|
||||
+ this.queryFormat;
|
||||
} else {
|
||||
nextQuery = "";
|
||||
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
|
||||
// resumptionInt and prevent a NullPointer Exception at mdStore
|
||||
}
|
||||
log.debug("nextQueryUrl: " + nextQuery);
|
||||
return nextQuery;
|
||||
} catch (final Throwable e) {
|
||||
log.warn(e.getMessage(), e);
|
||||
return downloadPage(query, attempt + 1);
|
||||
}
|
||||
log.debug("resultTotal: " + resultTotal);
|
||||
log.debug("resInt: " + resumptionInt);
|
||||
if (resumptionInt <= resultTotal) {
|
||||
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
|
||||
+ queryFormat;
|
||||
} else {
|
||||
nextQuery = "";
|
||||
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
|
||||
// resumptionInt and prevent a NullPointer Exception at mdStore
|
||||
}
|
||||
log.debug("nextQueryUrl: " + nextQuery);
|
||||
return nextQuery;
|
||||
|
||||
}
|
||||
|
||||
private boolean isInteger(String s) {
|
||||
private boolean isInteger(final String s) {
|
||||
boolean isValidInteger = false;
|
||||
try {
|
||||
Integer.parseInt(s);
|
||||
|
@ -383,7 +417,7 @@ public class RestIterator implements Iterator<String> {
|
|||
// s is a valid integer
|
||||
|
||||
isValidInteger = true;
|
||||
} catch (NumberFormatException ex) {
|
||||
} catch (final NumberFormatException ex) {
|
||||
// s is not an integer
|
||||
}
|
||||
|
||||
|
@ -391,20 +425,20 @@ public class RestIterator implements Iterator<String> {
|
|||
}
|
||||
|
||||
// Method to encode a string value using `UTF-8` encoding scheme
|
||||
private String encodeValue(String value) {
|
||||
private String encodeValue(final String value) {
|
||||
try {
|
||||
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
|
||||
} catch (UnsupportedEncodingException ex) {
|
||||
} catch (final UnsupportedEncodingException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
}
|
||||
|
||||
public String getResultFormatValue() {
|
||||
return resultFormatValue;
|
||||
return this.resultFormatValue;
|
||||
}
|
||||
|
||||
public String getResultOutputFormat() {
|
||||
return resultOutputFormat;
|
||||
return this.resultOutputFormat;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.rest;
|
|||
|
||||
import java.util.HashMap;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
|
@ -69,7 +70,7 @@ public class OsfPreprintCollectorTest {
|
|||
|
||||
@Test
|
||||
@Disabled
|
||||
void test() throws CollectorException {
|
||||
void test_limited() throws CollectorException {
|
||||
final AtomicInteger i = new AtomicInteger(0);
|
||||
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
|
||||
|
||||
|
@ -82,4 +83,23 @@ public class OsfPreprintCollectorTest {
|
|||
log.info("{}", i.intValue());
|
||||
Assertions.assertTrue(i.intValue() > 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
void test_all() throws CollectorException {
|
||||
final AtomicLong i = new AtomicLong(0);
|
||||
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
|
||||
|
||||
stream.forEach(s -> {
|
||||
Assertions.assertTrue(s.length() > 0);
|
||||
if ((i.incrementAndGet() % 1000) == 0) {
|
||||
log.info("COLLECTED: {}", i.get());
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
log.info("TOTAL: {}", i.get());
|
||||
Assertions.assertTrue(i.get() > 0);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.clean;
|
|||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
|
@ -29,7 +30,10 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
|
|||
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
|
||||
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
|
||||
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
|
||||
mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
|
||||
|
||||
// commenting out the subject cleaning until we decide if we want to it or not and the implementation will
|
||||
// be completed. At the moment it is not capable of expanding the whole hierarchy.
|
||||
// mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
|
||||
return mapping;
|
||||
}
|
||||
|
||||
|
@ -38,6 +42,13 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
|
|||
// TODO cleaning based on different subject vocabs can be added here
|
||||
}
|
||||
|
||||
/**
|
||||
* The procedure cleans out the subject values, using a vocabulary identified by the field subject.qualifier.classid.
|
||||
*
|
||||
* @param vocabularyId
|
||||
* @param vocabularies
|
||||
* @param subject
|
||||
*/
|
||||
private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
|
||||
Subject subject) {
|
||||
|
||||
|
@ -49,14 +60,22 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
|
|||
subject.getQualifier().setClassid(vocabularyId);
|
||||
subject.getQualifier().setClassname(vocabulary.getName());
|
||||
}
|
||||
} else if (vocabularyId.equals(subject.getQualifier().getClassid()) &&
|
||||
Objects.nonNull(subject.getDataInfo()) &&
|
||||
!"subject:fos".equals(subject.getDataInfo().getProvenanceaction())) {
|
||||
Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue());
|
||||
VocabularyTerm term = vocabulary.getTerm(subject.getValue());
|
||||
if (Objects.isNull(syn) && Objects.isNull(term)) {
|
||||
subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD);
|
||||
subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD);
|
||||
} else {
|
||||
final String provenanceActionClassId = Optional
|
||||
.ofNullable(subject.getDataInfo())
|
||||
.map(DataInfo::getProvenanceaction)
|
||||
.map(Qualifier::getClassid)
|
||||
.orElse(null);
|
||||
|
||||
if (vocabularyId.equals(subject.getQualifier().getClassid()) &&
|
||||
!"subject:fos".equals(provenanceActionClassId)) {
|
||||
|
||||
Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue());
|
||||
VocabularyTerm term = vocabulary.getTerm(subject.getValue());
|
||||
if (Objects.isNull(syn) && Objects.isNull(term)) {
|
||||
subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD);
|
||||
subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
|
|
@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 {
|
|||
result
|
||||
.getTitle()
|
||||
.stream()
|
||||
.filter(t -> StringUtils.isNotBlank(t.getValue()))
|
||||
.findFirst()
|
||||
.map(StructuredProperty::getValue)
|
||||
.ifPresent(
|
||||
title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH)));
|
||||
title -> {
|
||||
re.setTitle(title);
|
||||
re
|
||||
.getTitle()
|
||||
.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
|
||||
});
|
||||
}
|
||||
if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
|
||||
result
|
||||
|
|
|
@ -3,24 +3,16 @@ package eu.dnetlib.dhp.oa.provision;
|
|||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
|
||||
import static org.apache.spark.sql.functions.*;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.SparkContext;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.expressions.UserDefinedFunction;
|
||||
import org.apache.spark.sql.types.DataTypes;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -45,9 +37,9 @@ import scala.Tuple2;
|
|||
/**
|
||||
* XmlConverterJob converts the JoinedEntities as XML records
|
||||
*/
|
||||
public class XmlConverterJob {
|
||||
public class PayloadConverterJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
|
||||
private static final Logger log = LoggerFactory.getLogger(PayloadConverterJob.class);
|
||||
|
||||
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
|
||||
|
||||
|
@ -56,8 +48,8 @@ public class XmlConverterJob {
|
|||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
XmlConverterJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
|
||||
PayloadConverterJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
|
@ -72,6 +64,12 @@ public class XmlConverterJob {
|
|||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
final Boolean validateXML = Optional
|
||||
.ofNullable(parser.get("validateXML"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.FALSE);
|
||||
log.info("validateXML: {}", validateXML);
|
||||
|
||||
final String contextApiBaseUrl = parser.get("contextApiBaseUrl");
|
||||
log.info("contextApiBaseUrl: {}", contextApiBaseUrl);
|
||||
|
||||
|
@ -86,18 +84,19 @@ public class XmlConverterJob {
|
|||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
convertToXml(
|
||||
createPayloads(
|
||||
spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl),
|
||||
VocabularyGroup.loadVocsFromIS(isLookup));
|
||||
VocabularyGroup.loadVocsFromIS(isLookup), validateXML);
|
||||
});
|
||||
}
|
||||
|
||||
private static void convertToXml(
|
||||
private static void createPayloads(
|
||||
final SparkSession spark,
|
||||
final String inputPath,
|
||||
final String outputPath,
|
||||
final ContextMapper contextMapper,
|
||||
final VocabularyGroup vocabularies) {
|
||||
final VocabularyGroup vocabularies,
|
||||
final Boolean validateXML) {
|
||||
|
||||
final XmlRecordFactory recordFactory = new XmlRecordFactory(
|
||||
prepareAccumulators(spark.sparkContext()),
|
||||
|
@ -118,7 +117,7 @@ public class XmlConverterJob {
|
|||
.as(Encoders.kryo(JoinedEntity.class))
|
||||
.map(
|
||||
(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
|
||||
recordFactory.build(je),
|
||||
recordFactory.build(je, validateXML),
|
||||
ProvisionModelSupport.transform(je, contextMapper, vocabularies)),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class)))
|
||||
.map(
|
|
@ -2,42 +2,34 @@
|
|||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static org.apache.spark.sql.functions.col;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
import org.apache.spark.sql.expressions.Window;
|
||||
import org.apache.spark.sql.expressions.WindowSpec;
|
||||
import org.apache.spark.sql.functions;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
|
||||
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted
|
||||
|
@ -130,132 +122,36 @@ public class PrepareRelationsJob {
|
|||
private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath,
|
||||
Set<String> relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) {
|
||||
|
||||
JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath)
|
||||
.filter(rel -> !(rel.getSource().startsWith("unresolved") || rel.getTarget().startsWith("unresolved")))
|
||||
.filter(rel -> !rel.getDataInfo().getDeletedbyinference())
|
||||
.filter(rel -> !relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())));
|
||||
WindowSpec source_w = Window
|
||||
.partitionBy("source", "subRelType")
|
||||
.orderBy(col("target").desc_nulls_last());
|
||||
|
||||
JavaRDD<Relation> pruned = pruneRels(
|
||||
pruneRels(
|
||||
rels,
|
||||
sourceMaxRelations, relPartitions, (Function<Relation, String>) Relation::getSource),
|
||||
targetMaxRelations, relPartitions, (Function<Relation, String>) Relation::getTarget);
|
||||
spark
|
||||
.createDataset(pruned.rdd(), Encoders.bean(Relation.class))
|
||||
.repartition(relPartitions)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.parquet(outputPath);
|
||||
}
|
||||
WindowSpec target_w = Window
|
||||
.partitionBy("target", "subRelType")
|
||||
.orderBy(col("source").desc_nulls_last());
|
||||
|
||||
private static JavaRDD<Relation> pruneRels(JavaRDD<Relation> rels, int maxRelations,
|
||||
int relPartitions, Function<Relation, String> idFn) {
|
||||
return rels
|
||||
.mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, idFn.call(r)), r))
|
||||
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
|
||||
.groupBy(Tuple2::_1)
|
||||
.map(Tuple2::_2)
|
||||
.map(t -> Iterables.limit(t, maxRelations))
|
||||
.flatMap(Iterable::iterator)
|
||||
.map(Tuple2::_2);
|
||||
}
|
||||
|
||||
// experimental
|
||||
private static void prepareRelationsDataset(
|
||||
SparkSession spark, String inputRelationsPath, String outputPath, Set<String> relationFilter, int maxRelations,
|
||||
int relPartitions) {
|
||||
spark
|
||||
.read()
|
||||
.textFile(inputRelationsPath)
|
||||
.repartition(relPartitions)
|
||||
.map(
|
||||
(MapFunction<String, Relation>) s -> OBJECT_MAPPER.readValue(s, Relation.class),
|
||||
Encoders.kryo(Relation.class))
|
||||
.filter((FilterFunction<Relation>) rel -> !rel.getDataInfo().getDeletedbyinference())
|
||||
.filter((FilterFunction<Relation>) rel -> !relationFilter.contains(rel.getRelClass()))
|
||||
.groupByKey(
|
||||
(MapFunction<Relation, String>) Relation::getSource,
|
||||
Encoders.STRING())
|
||||
.agg(new RelationAggregator(maxRelations).toColumn())
|
||||
.flatMap(
|
||||
(FlatMapFunction<Tuple2<String, RelationList>, Relation>) t -> Iterables
|
||||
.limit(t._2().getRelations(), maxRelations)
|
||||
.iterator(),
|
||||
Encoders.bean(Relation.class))
|
||||
.repartition(relPartitions)
|
||||
.schema(Encoders.bean(Relation.class).schema())
|
||||
.json(inputRelationsPath)
|
||||
.where("source NOT LIKE 'unresolved%' AND target NOT LIKE 'unresolved%'")
|
||||
.where("datainfo.deletedbyinference != true")
|
||||
.where(
|
||||
relationFilter.isEmpty() ? ""
|
||||
: "lower(relClass) NOT IN ("
|
||||
+ relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")")
|
||||
.withColumn("source_w_pos", functions.row_number().over(source_w))
|
||||
.where("source_w_pos < " + sourceMaxRelations)
|
||||
.drop("source_w_pos")
|
||||
.withColumn("target_w_pos", functions.row_number().over(target_w))
|
||||
.where("target_w_pos < " + targetMaxRelations)
|
||||
.drop("target_w_pos")
|
||||
.coalesce(relPartitions)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.parquet(outputPath);
|
||||
}
|
||||
|
||||
public static class RelationAggregator
|
||||
extends Aggregator<Relation, RelationList, RelationList> {
|
||||
|
||||
private final int maxRelations;
|
||||
|
||||
public RelationAggregator(int maxRelations) {
|
||||
this.maxRelations = maxRelations;
|
||||
}
|
||||
|
||||
@Override
|
||||
public RelationList zero() {
|
||||
return new RelationList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public RelationList reduce(RelationList b, Relation a) {
|
||||
b.getRelations().add(a);
|
||||
return getSortableRelationList(b);
|
||||
}
|
||||
|
||||
@Override
|
||||
public RelationList merge(RelationList b1, RelationList b2) {
|
||||
b1.getRelations().addAll(b2.getRelations());
|
||||
return getSortableRelationList(b1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public RelationList finish(RelationList r) {
|
||||
return getSortableRelationList(r);
|
||||
}
|
||||
|
||||
private RelationList getSortableRelationList(RelationList b1) {
|
||||
RelationList sr = new RelationList();
|
||||
sr
|
||||
.setRelations(
|
||||
b1
|
||||
.getRelations()
|
||||
.stream()
|
||||
.limit(maxRelations)
|
||||
.collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator()))));
|
||||
return sr;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<RelationList> bufferEncoder() {
|
||||
return Encoders.kryo(RelationList.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<RelationList> outputEncoder() {
|
||||
return Encoders.kryo(RelationList.class);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
|
||||
* file,
|
||||
*
|
||||
* @param spark
|
||||
* @param inputPath
|
||||
* @return the JavaRDD<SortableRelation> containing all the relationships
|
||||
*/
|
||||
private static JavaRDD<Relation> readPathRelationRDD(
|
||||
SparkSession spark, final String inputPath) {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class));
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
|
|
@ -1,44 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class RelationComparator implements Comparator<Relation> {
|
||||
|
||||
private static final Map<String, Integer> weights = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
weights.put(ModelConstants.OUTCOME, 0);
|
||||
weights.put(ModelConstants.SUPPLEMENT, 1);
|
||||
weights.put(ModelConstants.REVIEW, 2);
|
||||
weights.put(ModelConstants.CITATION, 3);
|
||||
weights.put(ModelConstants.AFFILIATION, 4);
|
||||
weights.put(ModelConstants.RELATIONSHIP, 5);
|
||||
weights.put(ModelConstants.PUBLICATION_DATASET, 6);
|
||||
weights.put(ModelConstants.SIMILARITY, 7);
|
||||
|
||||
weights.put(ModelConstants.PROVISION, 8);
|
||||
weights.put(ModelConstants.PARTICIPATION, 9);
|
||||
weights.put(ModelConstants.DEDUP, 10);
|
||||
}
|
||||
|
||||
private Integer getWeight(Relation o) {
|
||||
return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(Relation o1, Relation o2) {
|
||||
return ComparisonChain
|
||||
.start()
|
||||
.compare(getWeight(o1), getWeight(o2))
|
||||
.result();
|
||||
}
|
||||
}
|
|
@ -1,25 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Queue;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class RelationList implements Serializable {
|
||||
|
||||
private Queue<Relation> relations;
|
||||
|
||||
public RelationList() {
|
||||
this.relations = new PriorityQueue<>(new RelationComparator());
|
||||
}
|
||||
|
||||
public Queue<Relation> getRelations() {
|
||||
return relations;
|
||||
}
|
||||
|
||||
public void setRelations(Queue<Relation> relations) {
|
||||
this.relations = relations;
|
||||
}
|
||||
}
|
|
@ -1,81 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class SortableRelation extends Relation implements Comparable<SortableRelation>, Serializable {
|
||||
|
||||
private static final Map<String, Integer> weights = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
weights.put(ModelConstants.OUTCOME, 0);
|
||||
weights.put(ModelConstants.SUPPLEMENT, 1);
|
||||
weights.put(ModelConstants.REVIEW, 2);
|
||||
weights.put(ModelConstants.CITATION, 3);
|
||||
weights.put(ModelConstants.AFFILIATION, 4);
|
||||
weights.put(ModelConstants.RELATIONSHIP, 5);
|
||||
weights.put(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, 6);
|
||||
weights.put(ModelConstants.SIMILARITY, 7);
|
||||
|
||||
weights.put(ModelConstants.PROVISION, 8);
|
||||
weights.put(ModelConstants.PARTICIPATION, 9);
|
||||
weights.put(ModelConstants.DEDUP, 10);
|
||||
}
|
||||
|
||||
private static final long serialVersionUID = 34753984579L;
|
||||
|
||||
private String groupingKey;
|
||||
|
||||
public static SortableRelation create(Relation r, String groupingKey) {
|
||||
SortableRelation sr = new SortableRelation();
|
||||
sr.setGroupingKey(groupingKey);
|
||||
sr.setSource(r.getSource());
|
||||
sr.setTarget(r.getTarget());
|
||||
sr.setRelType(r.getRelType());
|
||||
sr.setSubRelType(r.getSubRelType());
|
||||
sr.setRelClass(r.getRelClass());
|
||||
sr.setDataInfo(r.getDataInfo());
|
||||
sr.setCollectedfrom(r.getCollectedfrom());
|
||||
sr.setLastupdatetimestamp(r.getLastupdatetimestamp());
|
||||
sr.setProperties(r.getProperties());
|
||||
sr.setValidated(r.getValidated());
|
||||
sr.setValidationDate(r.getValidationDate());
|
||||
|
||||
return sr;
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public Relation asRelation() {
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(SortableRelation o) {
|
||||
return ComparisonChain
|
||||
.start()
|
||||
.compare(getGroupingKey(), o.getGroupingKey())
|
||||
.compare(getWeight(this), getWeight(o))
|
||||
.result();
|
||||
}
|
||||
|
||||
private Integer getWeight(SortableRelation o) {
|
||||
return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
|
||||
}
|
||||
|
||||
public String getGroupingKey() {
|
||||
return groupingKey;
|
||||
}
|
||||
|
||||
public void setGroupingKey(String groupingKey) {
|
||||
this.groupingKey = groupingKey;
|
||||
}
|
||||
}
|
|
@ -1,8 +1,6 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision.model;
|
||||
|
||||
import static org.apache.commons.lang3.StringUtils.substringBefore;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -16,16 +14,15 @@ import org.jetbrains.annotations.Nullable;
|
|||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm;
|
||||
import eu.dnetlib.dhp.oa.provision.RelationList;
|
||||
import eu.dnetlib.dhp.oa.provision.SortableRelation;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.solr.*;
|
||||
import eu.dnetlib.dhp.schema.solr.AccessRight;
|
||||
import eu.dnetlib.dhp.schema.solr.Author;
|
||||
|
@ -55,10 +52,7 @@ public class ProvisionModelSupport {
|
|||
.newArrayList(
|
||||
RelatedEntityWrapper.class,
|
||||
JoinedEntity.class,
|
||||
RelatedEntity.class,
|
||||
SortableRelationKey.class,
|
||||
SortableRelation.class,
|
||||
RelationList.class));
|
||||
RelatedEntity.class));
|
||||
return modelClasses.toArray(new Class[] {});
|
||||
}
|
||||
|
||||
|
@ -74,7 +68,11 @@ public class ProvisionModelSupport {
|
|||
.setHeader(
|
||||
SolrRecordHeader
|
||||
.newInstance(
|
||||
e.getId(), e.getOriginalId(), type, deletedbyinference));
|
||||
StringUtils
|
||||
.substringAfter(
|
||||
e.getId(),
|
||||
IdentifierFactory.ID_PREFIX_SEPARATOR),
|
||||
e.getOriginalId(), type, deletedbyinference));
|
||||
r.setCollectedfrom(asProvenance(e.getCollectedfrom()));
|
||||
r.setContext(asContext(e.getContext(), contextMapper));
|
||||
r.setPid(asPid(e.getPid()));
|
||||
|
@ -114,7 +112,8 @@ public class ProvisionModelSupport {
|
|||
.newInstance(
|
||||
relation.getRelType(),
|
||||
relation.getRelClass(),
|
||||
relation.getTarget(), relatedRecordType));
|
||||
StringUtils.substringAfter(relation.getTarget(), IdentifierFactory.ID_PREFIX_SEPARATOR),
|
||||
relatedRecordType));
|
||||
|
||||
rr.setAcronym(re.getAcronym());
|
||||
rr.setCode(re.getCode());
|
||||
|
@ -147,6 +146,7 @@ public class ProvisionModelSupport {
|
|||
ps.setContracttype(mapCodeLabel(p.getContracttype()));
|
||||
ps.setCurrency(mapField(p.getCurrency()));
|
||||
ps.setDuration(mapField(p.getDuration()));
|
||||
ps.setOamandatepublications(mapField(p.getOamandatepublications()));
|
||||
ps.setCallidentifier(mapField(p.getCallidentifier()));
|
||||
ps.setEcarticle29_3(mapField(p.getEcarticle29_3()));
|
||||
ps.setEnddate(mapField(p.getEnddate()));
|
||||
|
@ -387,7 +387,7 @@ public class ProvisionModelSupport {
|
|||
.equals(
|
||||
Optional
|
||||
.ofNullable(t.getQualifier())
|
||||
.map(Qualifier::getClassid)
|
||||
.map(Qualifier::getClassname)
|
||||
.orElse(null)))
|
||||
.map(StructuredProperty::getValue)
|
||||
.collect(Collectors.toList()))
|
||||
|
@ -405,7 +405,7 @@ public class ProvisionModelSupport {
|
|||
.equals(
|
||||
Optional
|
||||
.ofNullable(t.getQualifier())
|
||||
.map(Qualifier::getClassid)
|
||||
.map(Qualifier::getClassname)
|
||||
.orElse(null)))
|
||||
.map(StructuredProperty::getValue)
|
||||
.findFirst())
|
||||
|
@ -472,7 +472,7 @@ public class ProvisionModelSupport {
|
|||
}
|
||||
|
||||
private static String mapQualifier(eu.dnetlib.dhp.schema.oaf.Qualifier q) {
|
||||
return Optional.ofNullable(q).map(Qualifier::getClassid).orElse(null);
|
||||
return Optional.ofNullable(q).map(Qualifier::getClassname).orElse(null);
|
||||
}
|
||||
|
||||
private static Journal mapJournal(eu.dnetlib.dhp.schema.oaf.Journal joaf) {
|
||||
|
@ -581,7 +581,7 @@ public class ProvisionModelSupport {
|
|||
.map(
|
||||
pids -> pids
|
||||
.stream()
|
||||
.map(p -> Pid.newInstance(p.getQualifier().getClassid(), p.getValue()))
|
||||
.map(p -> Pid.newInstance(p.getQualifier().getClassname(), p.getValue()))
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
@ -606,8 +606,8 @@ public class ProvisionModelSupport {
|
|||
subjects -> subjects
|
||||
.stream()
|
||||
.filter(s -> Objects.nonNull(s.getQualifier()))
|
||||
.filter(s -> Objects.nonNull(s.getQualifier().getClassid()))
|
||||
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassid()))
|
||||
.filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
|
||||
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname()))
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
@ -619,8 +619,8 @@ public class ProvisionModelSupport {
|
|||
subjects -> subjects
|
||||
.stream()
|
||||
.filter(s -> Objects.nonNull(s.getQualifier()))
|
||||
.filter(s -> Objects.nonNull(s.getQualifier().getClassid()))
|
||||
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassid()))
|
||||
.filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
|
||||
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname()))
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
|
|
@ -93,10 +93,13 @@ public class XmlRecordFactory implements Serializable {
|
|||
}
|
||||
|
||||
public String build(final JoinedEntity je) {
|
||||
return build(je, false);
|
||||
}
|
||||
|
||||
public String build(final JoinedEntity je, final Boolean validate) {
|
||||
|
||||
final Set<String> contexts = Sets.newHashSet();
|
||||
|
||||
// final OafEntity entity = toOafEntity(je.getEntity());
|
||||
final OafEntity entity = je.getEntity();
|
||||
final TemplateFactory templateFactory = new TemplateFactory();
|
||||
try {
|
||||
|
@ -122,7 +125,13 @@ public class XmlRecordFactory implements Serializable {
|
|||
.buildBody(
|
||||
mainType, metadata, relations, listChildren(entity, je, templateFactory), listExtraInfo(entity));
|
||||
|
||||
return templateFactory.buildRecord(entity, schemaLocation, body);
|
||||
String xmlRecord = templateFactory.buildRecord(entity, schemaLocation, body);
|
||||
|
||||
if (Boolean.TRUE.equals(validate)) {
|
||||
// rise an exception when an invalid record was built
|
||||
new SAXReader().read(new StringReader(xmlRecord));
|
||||
}
|
||||
return xmlRecord;
|
||||
// return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent);
|
||||
} catch (final Throwable e) {
|
||||
throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e);
|
||||
|
@ -1038,13 +1047,21 @@ public class XmlRecordFactory implements Serializable {
|
|||
}
|
||||
|
||||
private List<String> measuresAsXml(List<Measure> measures) {
|
||||
return measures
|
||||
.stream()
|
||||
.map(m -> {
|
||||
List<Tuple2<String, String>> l = Lists.newArrayList(new Tuple2<>("id", m.getId()));
|
||||
m.getUnit().forEach(kv -> l.add(new Tuple2<>(kv.getKey(), kv.getValue())));
|
||||
return XmlSerializationUtils.asXmlElement("measure", l);
|
||||
})
|
||||
return Stream
|
||||
.concat(
|
||||
measures
|
||||
.stream()
|
||||
.filter(m -> !"downloads".equals(m.getId()) && !"views".equals(m.getId()))
|
||||
.map(m -> {
|
||||
List<Tuple2<String, String>> l = Lists.newArrayList(new Tuple2<>("id", m.getId()));
|
||||
m.getUnit().forEach(kv -> l.add(new Tuple2<>(kv.getKey(), kv.getValue())));
|
||||
return XmlSerializationUtils.asXmlElement("measure", l);
|
||||
}),
|
||||
measures
|
||||
.stream()
|
||||
.filter(m -> "downloads".equals(m.getId()) || "views".equals(m.getId()))
|
||||
.filter(m -> m.getUnit().stream().anyMatch(u -> Integer.parseInt(u.getValue()) > 0))
|
||||
.map(m -> XmlSerializationUtils.usageMeasureAsXmlElement("measure", m)))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
|
|
@ -5,7 +5,11 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix;
|
|||
import static org.apache.commons.lang3.StringUtils.isBlank;
|
||||
import static org.apache.commons.lang3.StringUtils.isNotBlank;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
|
@ -166,6 +170,33 @@ public class XmlSerializationUtils {
|
|||
return sb.toString();
|
||||
}
|
||||
|
||||
// <measure downloads="0" views="0">infrastruct_::f66f1bd369679b5b077dcdf006089556||OpenAIRE</measure>
|
||||
public static String usageMeasureAsXmlElement(String name, Measure measure) {
|
||||
HashSet<String> dsIds = Optional
|
||||
.ofNullable(measure.getUnit())
|
||||
.map(
|
||||
m -> m
|
||||
.stream()
|
||||
.map(KeyValue::getKey)
|
||||
.collect(Collectors.toCollection(HashSet::new)))
|
||||
.orElse(new HashSet<>());
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
dsIds.forEach(dsId -> {
|
||||
sb
|
||||
.append("<")
|
||||
.append(name);
|
||||
for (KeyValue kv : measure.getUnit()) {
|
||||
sb.append(" ").append(attr(measure.getId(), kv.getValue()));
|
||||
}
|
||||
sb
|
||||
.append(" ")
|
||||
.append(attr("datasource", dsId))
|
||||
.append("/>");
|
||||
});
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static String mapEoscIf(EoscIfGuidelines e) {
|
||||
return asXmlElement(
|
||||
"eoscifguidelines", Lists
|
||||
|
|
|
@ -22,5 +22,11 @@
|
|||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "URL of the context ISLookup Service",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "val",
|
||||
"paramLongName": "validateXML",
|
||||
"paramDescription": "should the process check the XML validity",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -13,6 +13,11 @@
|
|||
<name>contextApiBaseUrl</name>
|
||||
<description>context API URL</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>validateXML</name>
|
||||
<description>should the payload converter validate the XMLs</description>
|
||||
<value>false</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>relPartitions</name>
|
||||
<description>number or partitions for the relations Dataset</description>
|
||||
|
@ -125,7 +130,7 @@
|
|||
<case to="prepare_relations">${wf:conf('resumeFrom') eq 'prepare_relations'}</case>
|
||||
<case to="fork_join_related_entities">${wf:conf('resumeFrom') eq 'fork_join_related_entities'}</case>
|
||||
<case to="fork_join_all_entities">${wf:conf('resumeFrom') eq 'fork_join_all_entities'}</case>
|
||||
<case to="convert_to_xml">${wf:conf('resumeFrom') eq 'convert_to_xml'}</case>
|
||||
<case to="create_payloads">${wf:conf('resumeFrom') eq 'create_payloads'}</case>
|
||||
<case to="drop_solr_collection">${wf:conf('resumeFrom') eq 'drop_solr_collection'}</case>
|
||||
<case to="to_solr_index">${wf:conf('resumeFrom') eq 'to_solr_index'}</case>
|
||||
<default to="prepare_relations"/>
|
||||
|
@ -144,21 +149,23 @@
|
|||
<class>eu.dnetlib.dhp.oa.provision.PrepareRelationsJob</class>
|
||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCoresForJoining}
|
||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
||||
--executor-cores=4
|
||||
--executor-memory=6G
|
||||
--driver-memory=${sparkDriverMemoryForJoining}
|
||||
--conf spark.executor.memoryOverhead=6G
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=15000
|
||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
||||
</spark-opts>
|
||||
<arg>--inputRelationsPath</arg><arg>${inputGraphRootPath}/relation</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/relation</arg>
|
||||
<arg>--sourceMaxRelations</arg><arg>${sourceMaxRelations}</arg>
|
||||
<arg>--targetMaxRelations</arg><arg>${targetMaxRelations}</arg>
|
||||
<arg>--relationFilter</arg><arg>${relationFilter}</arg>
|
||||
<arg>--relPartitions</arg><arg>5000</arg>
|
||||
<arg>--relPartitions</arg><arg>15000</arg>
|
||||
</spark>
|
||||
<ok to="fork_join_related_entities"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -585,19 +592,20 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait_join_phase2" to="convert_to_xml"/>
|
||||
<join name="wait_join_phase2" to="create_payloads"/>
|
||||
|
||||
<action name="convert_to_xml">
|
||||
<action name="create_payloads">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>convert_to_xml</name>
|
||||
<class>eu.dnetlib.dhp.oa.provision.XmlConverterJob</class>
|
||||
<name>create_payloads</name>
|
||||
<class>eu.dnetlib.dhp.oa.provision.PayloadConverterJob</class>
|
||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -607,6 +615,7 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/join_entities</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/xml_json</arg>
|
||||
<arg>--validateXML</arg><arg>${validateXML}</arg>
|
||||
<arg>--contextApiBaseUrl</arg><arg>${contextApiBaseUrl}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
|
|
|
@ -50,7 +50,7 @@ public class EOSCFuture_Test {
|
|||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final OtherResearchProduct p = OBJECT_MAPPER
|
||||
.readValue(
|
||||
|
|
|
@ -57,7 +57,7 @@ public class IndexRecordTransformerTest {
|
|||
public void testPublicationRecordTransformation() throws IOException, TransformerException {
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final Publication p = load("publication.json", Publication.class);
|
||||
final Project pj = load("project.json", Project.class);
|
||||
|
@ -82,7 +82,7 @@ public class IndexRecordTransformerTest {
|
|||
void testPeerReviewed() throws IOException, TransformerException {
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final Publication p = load("publication.json", Publication.class);
|
||||
|
||||
|
@ -98,7 +98,7 @@ public class IndexRecordTransformerTest {
|
|||
public void testRiunet() throws IOException, TransformerException {
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final Publication p = load("riunet.json", Publication.class);
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ public class XmlRecordFactoryTest {
|
|||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final Publication p = OBJECT_MAPPER
|
||||
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
|
||||
|
@ -105,7 +105,7 @@ public class XmlRecordFactoryTest {
|
|||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final Publication p = OBJECT_MAPPER
|
||||
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
|
||||
|
@ -136,7 +136,7 @@ public class XmlRecordFactoryTest {
|
|||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final Publication p = OBJECT_MAPPER
|
||||
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
|
||||
|
@ -166,7 +166,7 @@ public class XmlRecordFactoryTest {
|
|||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final Datasource d = OBJECT_MAPPER
|
||||
.readValue(IOUtils.toString(getClass().getResourceAsStream("datasource.json")), Datasource.class);
|
||||
|
@ -203,7 +203,7 @@ public class XmlRecordFactoryTest {
|
|||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final OtherResearchProduct p = OBJECT_MAPPER
|
||||
.readValue(
|
||||
|
@ -226,7 +226,7 @@ public class XmlRecordFactoryTest {
|
|||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final OtherResearchProduct p = OBJECT_MAPPER
|
||||
.readValue(
|
||||
|
@ -249,7 +249,7 @@ public class XmlRecordFactoryTest {
|
|||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final Publication p = OBJECT_MAPPER
|
||||
.readValue(
|
||||
|
|
|
@ -67,24 +67,21 @@ function copydb() {
|
|||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||
rm -f error.log
|
||||
return 1
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Make Impala aware of the deletion of the old DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
|
||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
|
||||
# Using max memory of: 50 * 6144 = 300 Gb
|
||||
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
|
||||
# Using max memory of: 70 * 6144 = 430 Gb
|
||||
# Using 1MB as a buffer-size.
|
||||
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
||||
# The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
|
||||
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||
-numListstatusThreads 40 \
|
||||
-copybuffersize 1048576 \
|
||||
-strategy dynamic \
|
||||
-blocksperchunk 8 \
|
||||
-pb \
|
||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||
|
||||
|
@ -92,9 +89,9 @@ function copydb() {
|
|||
if [ $? -eq 0 ]; then
|
||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
|
||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||
rm -f error.log
|
||||
return 2
|
||||
exit 3
|
||||
fi
|
||||
|
||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||
|
@ -105,14 +102,11 @@ function copydb() {
|
|||
# create the new database (with the same name)
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
|
||||
# Make Impala aware of the creation of the new DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||
|
||||
all_create_view_statements=()
|
||||
num_tables=0
|
||||
|
||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||
|
@ -129,9 +123,11 @@ function copydb() {
|
|||
all_create_view_statements+=("$create_view_statement")
|
||||
else
|
||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||
((num_tables++))
|
||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
||||
else
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
|
@ -142,74 +138,73 @@ function copydb() {
|
|||
fi
|
||||
done
|
||||
|
||||
echo -e "\nAll tables have been created, going to create the views..\n"
|
||||
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
|
||||
if [[ $num_tables -gt 0 ]]; then
|
||||
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
|
||||
else
|
||||
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
|
||||
fi
|
||||
|
||||
# Time to loop through the views and create them.
|
||||
# At this point all table-schemas should have been created.
|
||||
|
||||
previous_num_of_views_to_retry=${#all_create_view_statements}
|
||||
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||
# Make Impala aware of the new tables, so it knows them when creating the views.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||
else
|
||||
echo -e "\nDB '${db}' does not contain any views.\n"
|
||||
fi
|
||||
|
||||
level_counter=0
|
||||
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
|
||||
while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
|
||||
((level_counter++))
|
||||
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
||||
# In this case, we should retry creating this particular view again.
|
||||
should_retry_create_view_statements=()
|
||||
new_num_of_views_to_retry=0
|
||||
|
||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||
if [ -n "$specific_errors" ]; then
|
||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
|
||||
should_retry_create_view_statements+=("$create_view_statement")
|
||||
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
|
||||
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
|
||||
else
|
||||
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
|
||||
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
||||
fi
|
||||
done
|
||||
|
||||
new_num_of_views_to_retry=${#should_retry_create_view_statements}
|
||||
all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
|
||||
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
|
||||
|
||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||
return 3
|
||||
exit 5
|
||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
|
||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||
else
|
||||
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
||||
fi
|
||||
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
|
||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
done
|
||||
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
|
||||
echo -e "\nComputing stats for tables..\n"
|
||||
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||
fi
|
||||
done
|
||||
|
||||
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
|
||||
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||
rm -f error.log
|
||||
return 4
|
||||
exit 6
|
||||
fi
|
||||
|
||||
rm -f error.log
|
||||
|
|
|
@ -66,24 +66,21 @@ function copydb() {
|
|||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||
rm -f error.log
|
||||
return 1
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Make Impala aware of the deletion of the old DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
|
||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
|
||||
# Using max memory of: 50 * 6144 = 300 Gb
|
||||
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
|
||||
# Using max memory of: 70 * 6144 = 430 Gb
|
||||
# Using 1MB as a buffer-size.
|
||||
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
||||
# The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
|
||||
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||
-numListstatusThreads 40 \
|
||||
-copybuffersize 1048576 \
|
||||
-strategy dynamic \
|
||||
-blocksperchunk 8 \
|
||||
-pb \
|
||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||
|
||||
|
@ -91,9 +88,9 @@ function copydb() {
|
|||
if [ $? -eq 0 ]; then
|
||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
|
||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||
rm -f error.log
|
||||
return 2
|
||||
exit 3
|
||||
fi
|
||||
|
||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||
|
@ -104,14 +101,11 @@ function copydb() {
|
|||
# create the new database (with the same name)
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
|
||||
# Make Impala aware of the creation of the new DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||
|
||||
all_create_view_statements=()
|
||||
num_tables=0
|
||||
|
||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||
|
@ -128,9 +122,11 @@ function copydb() {
|
|||
all_create_view_statements+=("$create_view_statement")
|
||||
else
|
||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||
((num_tables++))
|
||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
||||
else
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
|
@ -141,74 +137,73 @@ function copydb() {
|
|||
fi
|
||||
done
|
||||
|
||||
echo -e "\nAll tables have been created, going to create the views..\n"
|
||||
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
|
||||
if [[ $num_tables -gt 0 ]]; then
|
||||
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
|
||||
else
|
||||
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
|
||||
fi
|
||||
|
||||
# Time to loop through the views and create them.
|
||||
# At this point all table-schemas should have been created.
|
||||
|
||||
previous_num_of_views_to_retry=${#all_create_view_statements}
|
||||
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||
# Make Impala aware of the new tables, so it knows them when creating the views.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||
else
|
||||
echo -e "\nDB '${db}' does not contain any views.\n"
|
||||
fi
|
||||
|
||||
level_counter=0
|
||||
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
|
||||
while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
|
||||
((level_counter++))
|
||||
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
||||
# In this case, we should retry creating this particular view again.
|
||||
should_retry_create_view_statements=()
|
||||
new_num_of_views_to_retry=0
|
||||
|
||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||
if [ -n "$specific_errors" ]; then
|
||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
|
||||
should_retry_create_view_statements+=("$create_view_statement")
|
||||
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
|
||||
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
|
||||
else
|
||||
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
|
||||
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
||||
fi
|
||||
done
|
||||
|
||||
new_num_of_views_to_retry=${#should_retry_create_view_statements}
|
||||
all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
|
||||
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
|
||||
|
||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||
return 3
|
||||
exit 5
|
||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
|
||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||
else
|
||||
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
||||
fi
|
||||
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
|
||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
done
|
||||
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
|
||||
echo -e "\nComputing stats for tables..\n"
|
||||
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||
fi
|
||||
done
|
||||
|
||||
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
|
||||
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||
rm -f error.log
|
||||
return 4
|
||||
exit 6
|
||||
fi
|
||||
|
||||
rm -f error.log
|
||||
|
|
|
@ -66,24 +66,21 @@ function copydb() {
|
|||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||
rm -f error.log
|
||||
return 1
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Make Impala aware of the deletion of the old DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
|
||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
|
||||
# Using max memory of: 50 * 6144 = 300 Gb
|
||||
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
|
||||
# Using max memory of: 70 * 6144 = 430 Gb
|
||||
# Using 1MB as a buffer-size.
|
||||
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
||||
# The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
|
||||
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||
-numListstatusThreads 40 \
|
||||
-copybuffersize 1048576 \
|
||||
-strategy dynamic \
|
||||
-blocksperchunk 8 \
|
||||
-pb \
|
||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||
|
||||
|
@ -91,9 +88,9 @@ function copydb() {
|
|||
if [ $? -eq 0 ]; then
|
||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
|
||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||
rm -f error.log
|
||||
return 2
|
||||
exit 3
|
||||
fi
|
||||
|
||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||
|
@ -104,14 +101,11 @@ function copydb() {
|
|||
# create the new database (with the same name)
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
|
||||
# Make Impala aware of the creation of the new DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||
|
||||
all_create_view_statements=()
|
||||
num_tables=0
|
||||
|
||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||
|
@ -128,9 +122,11 @@ function copydb() {
|
|||
all_create_view_statements+=("$create_view_statement")
|
||||
else
|
||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||
((num_tables++))
|
||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
||||
else
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
|
@ -141,74 +137,73 @@ function copydb() {
|
|||
fi
|
||||
done
|
||||
|
||||
echo -e "\nAll tables have been created, going to create the views..\n"
|
||||
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
|
||||
if [[ $num_tables -gt 0 ]]; then
|
||||
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
|
||||
else
|
||||
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
|
||||
fi
|
||||
|
||||
# Time to loop through the views and create them.
|
||||
# At this point all table-schemas should have been created.
|
||||
|
||||
previous_num_of_views_to_retry=${#all_create_view_statements}
|
||||
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||
# Make Impala aware of the new tables, so it knows them when creating the views.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||
else
|
||||
echo -e "\nDB '${db}' does not contain any views.\n"
|
||||
fi
|
||||
|
||||
level_counter=0
|
||||
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
|
||||
while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
|
||||
((level_counter++))
|
||||
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
||||
# In this case, we should retry creating this particular view again.
|
||||
should_retry_create_view_statements=()
|
||||
new_num_of_views_to_retry=0
|
||||
|
||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||
if [ -n "$specific_errors" ]; then
|
||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
|
||||
should_retry_create_view_statements+=("$create_view_statement")
|
||||
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
|
||||
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
|
||||
else
|
||||
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
|
||||
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
||||
fi
|
||||
done
|
||||
|
||||
new_num_of_views_to_retry=${#should_retry_create_view_statements}
|
||||
all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
|
||||
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
|
||||
|
||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||
return 3
|
||||
exit 5
|
||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
|
||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||
else
|
||||
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
||||
fi
|
||||
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
|
||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
done
|
||||
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
|
||||
echo -e "\nComputing stats for tables..\n"
|
||||
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||
fi
|
||||
done
|
||||
|
||||
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
|
||||
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||
rm -f error.log
|
||||
return 4
|
||||
exit 6
|
||||
fi
|
||||
|
||||
rm -f error.log
|
||||
|
|
|
@ -68,24 +68,21 @@ function copydb() {
|
|||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
|
||||
rm -f error.log
|
||||
return 1
|
||||
exit 2
|
||||
fi
|
||||
|
||||
# Make Impala aware of the deletion of the old DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
|
||||
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
|
||||
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s
|
||||
# Using max memory of: 50 * 6144 = 300 Gb
|
||||
# Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
|
||||
# Using max memory of: 70 * 6144 = 430 Gb
|
||||
# Using 1MB as a buffer-size.
|
||||
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop
|
||||
# The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
|
||||
# The "ug" args cannot be used as we get a "User does not belong to hive" error.
|
||||
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
|
||||
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
|
||||
-numListstatusThreads 40 \
|
||||
-copybuffersize 1048576 \
|
||||
-strategy dynamic \
|
||||
-blocksperchunk 8 \
|
||||
-pb \
|
||||
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
|
||||
|
||||
|
@ -93,9 +90,9 @@ function copydb() {
|
|||
if [ $? -eq 0 ]; then
|
||||
echo -e "\nSuccessfully copied the files of '${db}'.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n"
|
||||
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
|
||||
rm -f error.log
|
||||
return 2
|
||||
exit 3
|
||||
fi
|
||||
|
||||
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
|
||||
|
@ -106,14 +103,11 @@ function copydb() {
|
|||
# create the new database (with the same name)
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
|
||||
# Make Impala aware of the creation of the new DB immediately.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||
|
||||
all_create_view_statements=()
|
||||
num_tables=0
|
||||
|
||||
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
|
||||
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||
|
@ -130,9 +124,11 @@ function copydb() {
|
|||
all_create_view_statements+=("$create_view_statement")
|
||||
else
|
||||
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
|
||||
((num_tables++))
|
||||
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
|
||||
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
|
||||
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
|
||||
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
|
||||
else
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
|
@ -143,74 +139,73 @@ function copydb() {
|
|||
fi
|
||||
done
|
||||
|
||||
echo -e "\nAll tables have been created, going to create the views..\n"
|
||||
previous_num_of_views_to_retry=${#all_create_view_statements[@]}
|
||||
if [[ $num_tables -gt 0 ]]; then
|
||||
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
|
||||
else
|
||||
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
|
||||
fi
|
||||
|
||||
# Time to loop through the views and create them.
|
||||
# At this point all table-schemas should have been created.
|
||||
|
||||
previous_num_of_views_to_retry=${#all_create_view_statements}
|
||||
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||
# Make Impala aware of the new tables, so it knows them when creating the views.
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
|
||||
else
|
||||
echo -e "\nDB '${db}' does not contain any views.\n"
|
||||
fi
|
||||
|
||||
level_counter=0
|
||||
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do
|
||||
while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
|
||||
((level_counter++))
|
||||
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
|
||||
# In this case, we should retry creating this particular view again.
|
||||
should_retry_create_view_statements=()
|
||||
new_num_of_views_to_retry=0
|
||||
|
||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||
if [ -n "$specific_errors" ]; then
|
||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n"
|
||||
should_retry_create_view_statements+=("$create_view_statement")
|
||||
echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
|
||||
((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
|
||||
else
|
||||
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
|
||||
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
|
||||
fi
|
||||
done
|
||||
|
||||
new_num_of_views_to_retry=${#should_retry_create_view_statements}
|
||||
all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
|
||||
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
|
||||
|
||||
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
|
||||
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
|
||||
return 3
|
||||
exit 5
|
||||
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
|
||||
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n"
|
||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
|
||||
else
|
||||
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
|
||||
fi
|
||||
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views.
|
||||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
done
|
||||
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
|
||||
sleep 1
|
||||
|
||||
echo -e "\nComputing stats for tables..\n"
|
||||
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
|
||||
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
|
||||
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
|
||||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
|
||||
fi
|
||||
done
|
||||
|
||||
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
|
||||
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
|
||||
echo -e "\nAll entities have been copied to Impala cluster.\n"
|
||||
else
|
||||
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
|
||||
rm -f error.log
|
||||
return 4
|
||||
exit 6
|
||||
fi
|
||||
|
||||
rm -f error.log
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
-- Additional relations
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
-- Additional relations
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
|
||||
-------------------------------------------
|
||||
--- Extra tables, mostly used by indicators
|
||||
|
||||
|
|
|
@ -249,7 +249,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
|
|||
left semi join dd on dd.id=pd.datasource
|
||||
union all
|
||||
select ra.id, 1 as is_gold
|
||||
from ${stats_db_name}.result_accessroute ra on ra.id = pd.id where ra.accessroute = 'gold') tmp on tmp.id=pd.id; /*EOS*/
|
||||
from ${stats_db_name}.result_accessroute ra where ra.accessroute = 'gold') tmp on tmp.id=pd.id; /*EOS*/
|
||||
|
||||
drop table if exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as parquet as
|
||||
|
@ -294,7 +294,7 @@ left outer join (
|
|||
join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=p.id
|
||||
left outer join ${stats_db_name}.result_accessroute ra on ra.id=p.id
|
||||
where indi_gold.is_gold=0 and
|
||||
((d.type like '%Journal%' and ri.accessright not in ('Closed Access', 'Restricted', 'Not Available') and ri.license is not null) or ra.accessroute='hybrid')) tmp on pd.i=tmp.id; /*EOS*/
|
||||
((d.type like '%Journal%' and ri.accessright not in ('Closed Access', 'Restricted', 'Not Available') and ri.license is not null) or ra.accessroute='hybrid')) tmp on p.id=tmp.id; /*EOS*/
|
||||
|
||||
drop table if exists ${stats_db_name}.indi_org_fairness purge; /*EOS*/
|
||||
create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet as
|
||||
|
@ -380,7 +380,7 @@ CREATE TEMPORARY VIEW allresults as
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub as
|
||||
create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as
|
||||
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
|
||||
from allresults ar join result_fair rf
|
||||
on rf.organization=ar.organization; /*EOS*/
|
||||
|
@ -639,7 +639,7 @@ from ${stats_db_name}.publication p
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_result_with_pid as
|
||||
create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as
|
||||
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
|
||||
from ${stats_db_name}.result p
|
||||
left outer join (
|
||||
|
@ -653,7 +653,7 @@ group by rf.id; /*EOS*/
|
|||
|
||||
drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
|
||||
|
||||
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity as
|
||||
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as
|
||||
select distinct p.id as id, coalesce(is_interdisciplinary, 0)
|
||||
as is_interdisciplinary
|
||||
from pub_fos_totals p
|
||||
|
@ -1006,14 +1006,14 @@ left outer join (
|
|||
drop table if exists ${stats_db_name}.result_country purge; /*EOS*/
|
||||
|
||||
create table ${stats_db_name}.result_country stored as parquet as
|
||||
select distinct *
|
||||
select distinct id, country
|
||||
from (
|
||||
select ro.id, o.country
|
||||
from ${stats_db_name}.result_organization ro
|
||||
left outer join ${stats_db_name}.organization o on o.id=ro.organization
|
||||
union all
|
||||
select rp.id, f.country
|
||||
from ${stats_db_name}.result_projects
|
||||
from ${stats_db_name}.result_projects rp
|
||||
left outer join ${stats_db_name}.project p on p.id=rp.project
|
||||
left outer join ${stats_db_name}.funder f on f.name=p.funder
|
||||
) rc
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
|
||||
----------------------------------------------------
|
||||
-- Shortcuts for various definitions in stats db ---
|
||||
----------------------------------------------------
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
|
||||
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
|
||||
-- peer reviewed)
|
||||
drop table if exists ${stats_db_name}.result_tmp;
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
set mapred.job.queue.name=analytics;
|
||||
|
||||
--------------------------------------------------------------
|
||||
--------------------------------------------------------------
|
||||
-- Publication table/view and Publication related tables/views
|
||||
|
|
|
@ -368,6 +368,7 @@
|
|||
${sparkClusterOpts}
|
||||
${sparkResourceOpts}
|
||||
${sparkApplicationOpts}
|
||||
--queue analytics
|
||||
</spark-opts>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
|
||||
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql</arg>
|
||||
|
|
|
@ -30,6 +30,10 @@
|
|||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>mapred.child.java.opts</name>
|
||||
<value>-Xmx16g</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
|
|
2
pom.xml
2
pom.xml
|
@ -960,7 +960,7 @@
|
|||
<commons.logging.version>1.1.3</commons.logging.version>
|
||||
<commons-validator.version>1.7</commons-validator.version>
|
||||
<dateparser.version>1.0.7</dateparser.version>
|
||||
<dhp-schemas.version>[6.1.2-SNAPSHOT]</dhp-schemas.version>
|
||||
<dhp-schemas.version>[6.1.2]</dhp-schemas.version>
|
||||
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
|
||||
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
|
||||
<dhp.guava.version>11.0.2</dhp.guava.version>
|
||||
|
|
Loading…
Reference in New Issue