merge branch with master

This commit is contained in:
Miriam Baglioni 2020-11-05 16:31:18 +01:00
commit f8e9bda24c
8 changed files with 267 additions and 165 deletions

View File

@ -1,14 +1,15 @@
package eu.dnetlib.dhp.common;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.hadoop.fs.*;
package eu.dnetlib.dhp.common;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.hadoop.fs.*;
public class MakeTarArchive implements Serializable {
private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException {
@ -113,7 +114,4 @@ public class MakeTarArchive implements Serializable {
return current_size;
}
}

View File

@ -9,7 +9,6 @@ import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@Disabled
public class HttpConnectorTest {

View File

@ -320,6 +320,7 @@
<ok to="join_extend"/>
<error to="Kill"/>
</action>
<action name="extend_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -344,6 +345,7 @@
<ok to="join_extend"/>
<error to="Kill"/>
</action>
<action name="extend_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -407,7 +409,6 @@
<error to="Kill"/>
</action>
<action name="send_zenodo">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>
@ -424,8 +425,6 @@
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -357,10 +357,8 @@
<error to="Kill"/>
</action>
<join name="join_dump" to="fork_context"/>
<fork name="fork_context">
<path start="create_entities_fromcontext"/>
<path start="create_relation_fromcontext"/>
@ -389,7 +387,6 @@
<error to="Kill"/>
</action>
<action name="create_relation_fromorgs">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -418,7 +415,6 @@
<join name="join_context" to="fork_extract_relations"/>
<fork name="fork_extract_relations">
<path start="rels_from_pubs"/>
<path start="rels_from_dats"/>
@ -530,7 +526,6 @@
<error to="Kill"/>
</action>
<join name="join_extract_relations" to="collect_and_save"/>
<action name="collect_and_save">
@ -569,7 +564,6 @@
<error to="Kill"/>
</action>
<action name="send_zenodo">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.SendToZenodoHDFS</main-class>

View File

@ -6,11 +6,36 @@ import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
import eu.dnetlib.dhp.schema.oaf.Relation
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
object SparkGenerateScholixIndex {
/**
 * Creates a typed Spark aggregator that folds (key, Scholix) pairs into a single
 * Scholix by repeatedly calling `Scholix.mergeFrom`.
 *
 * Only the Scholix element of each input pair takes part in the merge; the String
 * element is ignored here. Both the intermediate buffer and the final result are
 * Kryo-encoded Scholix instances.
 *
 * @return the aggregator instance
 */
def getScholixAggregator(): Aggregator[(String, Scholix), Scholix, Scholix] =
  new Aggregator[(String, Scholix), Scholix, Scholix] {

    // Starting buffer: a freshly constructed Scholix.
    // NOTE(review): the first reduce() call invokes mergeFrom on this empty instance;
    // confirm that Scholix.mergeFrom tolerates unset fields on the receiver.
    override def zero: Scholix = new Scholix()

    // Folds one input row into the running buffer (the buffer is mutated and returned).
    override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
      val incoming: Scholix = a._2
      b.mergeFrom(incoming)
      b
    }

    // Combines two partial buffers; the left one absorbs the right one and is returned.
    override def merge(wx: Scholix, wy: Scholix): Scholix = {
      wx.mergeFrom(wy)
      wx
    }

    // The buffer already is the final value, no post-processing is applied.
    override def finish(reduction: Scholix): Scholix = reduction

    override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]

    override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
  }
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkGenerateScholixIndex.getClass.getResourceAsStream("/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json")))
parser.parseArgument(args)
@ -40,7 +65,7 @@ object SparkGenerateScholixIndex {
(relation.getTarget, Scholix.generateScholixWithSource(summary,relation))
}).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_source")
}).repartition(6000).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_source")
val sTarget:Dataset[(String,Scholix)] = spark.read.load(s"$workingDirPath/scholix_source").as[(String, Scholix)]
@ -53,9 +78,16 @@ object SparkGenerateScholixIndex {
scholix.generateIdentifier()
scholix.generatelinkPublisher()
scholix
}).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix")
}).repartition(6000).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix_r")
val finalScholix:Dataset[Scholix] = spark.read.load(s"$workingDirPath/scholix_r").as[Scholix]
finalScholix.map(d => (d.getIdentifier, d))(Encoders.tuple(Encoders.STRING, scholixEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(getScholixAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/scholix")
}

View File

@ -5,6 +5,8 @@ import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary;
@ -91,13 +93,91 @@ public class Scholix implements Serializable {
s.setSource(ScholixResource.fromSummary(scholixSummary));
s.setIdentifier(rel.getTarget());
// ScholixResource mockTarget = new ScholixResource();
// mockTarget.setDnetIdentifier(rel.getTarget());
// s.setTarget(mockTarget);
// s.generateIdentifier();
return s;
}
/**
 * Merges two lists of {@link ScholixEntityId} into a new list.
 *
 * <p>The result starts as a copy of {@code a}; each element of {@code b} is appended
 * unless an element with the same name (compared ignoring case) is already present.
 * Neither input list is modified.
 *
 * <p>Null handling: a {@code null} list is treated as empty, {@code null} elements of
 * {@code b} are skipped, and an element whose name is {@code null} never matches
 * another element (so it is always appended).
 *
 * @param a the base list, may be {@code null}
 * @param b the list whose missing entries are appended, may be {@code null}
 * @return a new mutable list, never {@code null}
 */
private List<ScholixEntityId> mergeScholixEntityId(final List<ScholixEntityId> a, final List<ScholixEntityId> b) {
    final List<ScholixEntityId> m = new ArrayList<>();
    // new ArrayList<>(null) would throw, so an unset base list is treated as empty.
    if (a != null) {
        m.addAll(a);
    }
    if (b == null) {
        return m;
    }
    for (final ScholixEntityId s : b) {
        if (s == null) {
            continue;
        }
        // Guard the receiver side: entries already in m may be null or have no name.
        final boolean alreadyPresent = m
            .stream()
            .anyMatch(t -> t != null && t.getName() != null && t.getName().equalsIgnoreCase(s.getName()));
        if (!alreadyPresent) {
            m.add(s);
        }
    }
    return m;
}
/**
 * Merges two lists of {@link ScholixIdentifier} into a new list.
 *
 * <p>The result starts as a copy of {@code a}; each element of {@code b} is appended
 * unless an element with the same identifier value (compared ignoring case) is already
 * present. Neither input list is modified.
 *
 * <p>Null handling: a {@code null} list is treated as empty, {@code null} elements of
 * {@code b} are skipped, and an element whose identifier value is {@code null} never
 * matches another element (so it is always appended).
 *
 * @param a the base list, may be {@code null}
 * @param b the list whose missing entries are appended, may be {@code null}
 * @return a new mutable list, never {@code null}
 */
private List<ScholixIdentifier> mergeScholixIdnetifier(final List<ScholixIdentifier> a,
    final List<ScholixIdentifier> b) {
    final List<ScholixIdentifier> m = new ArrayList<>();
    // new ArrayList<>(null) would throw, so an unset base list is treated as empty.
    if (a != null) {
        m.addAll(a);
    }
    if (b == null) {
        return m;
    }
    for (final ScholixIdentifier s : b) {
        if (s == null) {
            continue;
        }
        // Guard the receiver side: entries already in m may be null or lack an identifier.
        final boolean alreadyPresent = m
            .stream()
            .anyMatch(
                t -> t != null && t.getIdentifier() != null
                    && t.getIdentifier().equalsIgnoreCase(s.getIdentifier()));
        if (!alreadyPresent) {
            m.add(s);
        }
    }
    return m;
}
/**
 * Merges two lists of {@link ScholixCollectedFrom} into a new list.
 *
 * <p>The result starts as a copy of {@code a}; each element of {@code b} is appended
 * unless an element whose provider has the same name (compared ignoring case) is
 * already present. Neither input list is modified.
 *
 * <p>Null handling: a {@code null} list is treated as empty, {@code null} elements of
 * {@code b} are skipped, and an element without a provider or provider name never
 * matches another element (so it is always appended).
 *
 * @param a the base list, may be {@code null}
 * @param b the list whose missing entries are appended, may be {@code null}
 * @return a new mutable list, never {@code null}
 */
private List<ScholixCollectedFrom> mergeScholixCollectedFrom(final List<ScholixCollectedFrom> a,
    final List<ScholixCollectedFrom> b) {
    final List<ScholixCollectedFrom> m = new ArrayList<>();
    // new ArrayList<>(null) would throw, so an unset base list is treated as empty.
    if (a != null) {
        m.addAll(a);
    }
    if (b == null) {
        return m;
    }
    for (final ScholixCollectedFrom s : b) {
        if (s == null) {
            continue;
        }
        final String candidateName = providerNameOf(s);
        final boolean alreadyPresent = m.stream().anyMatch(t -> {
            final String existingName = providerNameOf(t);
            return existingName != null && existingName.equalsIgnoreCase(candidateName);
        });
        if (!alreadyPresent) {
            m.add(s);
        }
    }
    return m;
}

/** Returns the provider name of {@code cf}, or {@code null} when the entry or its provider is unset. */
private static String providerNameOf(final ScholixCollectedFrom cf) {
    if (cf == null || cf.getProvider() == null) {
        return null;
    }
    return cf.getProvider().getName();
}
/**
 * Merges two relationships field by field, preferring the non-empty values of
 * {@code a} and falling back to the values of {@code b}.
 *
 * <p>Null handling: when exactly one argument is {@code null} the result is a copy of
 * the other one; when both are {@code null} there is nothing to merge and {@code null}
 * is returned.
 *
 * @param a the preferred relationship, may be {@code null}
 * @param b the fallback relationship, may be {@code null}
 * @return a new {@link ScholixRelationship}, or {@code null} if both inputs are {@code null}
 */
private ScholixRelationship mergeRelationships(final ScholixRelationship a, final ScholixRelationship b) {
    if (a == null && b == null) {
        return null;
    }
    // If one side is missing, use the other for both roles so no null is dereferenced.
    final ScholixRelationship primary = a != null ? a : b;
    final ScholixRelationship fallback = b != null ? b : a;

    final ScholixRelationship result = new ScholixRelationship();
    result.setName(StringUtils.isEmpty(primary.getName()) ? fallback.getName() : primary.getName());
    result.setInverse(StringUtils.isEmpty(primary.getInverse()) ? fallback.getInverse() : primary.getInverse());
    result.setSchema(StringUtils.isEmpty(primary.getSchema()) ? fallback.getSchema() : primary.getSchema());
    return result;
}
/**
 * Merges two resources into a new {@link ScholixResource}.
 *
 * <p>List-valued fields (collectedFrom, creator, identifier, publisher) are combined
 * through the dedicated list-merge helpers; for scalar fields the non-blank value of
 * {@code a} wins, otherwise the value of {@code b} is used.
 *
 * <p>Null handling: when one argument is {@code null} the other one is returned as is
 * (the same instance, not a copy); when both are {@code null} the result is
 * {@code null}.
 *
 * @param a the preferred resource, may be {@code null}
 * @param b the fallback resource, may be {@code null}
 * @return the merged resource, or the only non-null input, or {@code null}
 */
private ScholixResource mergeResource(final ScholixResource a, final ScholixResource b) {
    // Nothing to combine when one side is unset; avoids dereferencing null below.
    if (a == null) {
        return b;
    }
    if (b == null) {
        return a;
    }

    final ScholixResource result = new ScholixResource();
    result.setCollectedFrom(mergeScholixCollectedFrom(a.getCollectedFrom(), b.getCollectedFrom()));
    result.setCreator(mergeScholixEntityId(a.getCreator(), b.getCreator()));
    result
        .setDnetIdentifier(
            StringUtils.isBlank(a.getDnetIdentifier()) ? b.getDnetIdentifier() : a.getDnetIdentifier());
    result.setIdentifier(mergeScholixIdnetifier(a.getIdentifier(), b.getIdentifier()));
    result.setObjectType(StringUtils.isNotBlank(a.getObjectType()) ? a.getObjectType() : b.getObjectType());
    result
        .setObjectSubType(
            StringUtils.isNotBlank(a.getObjectSubType()) ? a.getObjectSubType() : b.getObjectSubType());
    result.setPublisher(mergeScholixEntityId(a.getPublisher(), b.getPublisher()));
    result
        .setPublicationDate(
            StringUtils.isNotBlank(a.getPublicationDate()) ? a.getPublicationDate() : b.getPublicationDate());
    result.setTitle(StringUtils.isNotBlank(a.getTitle()) ? a.getTitle() : b.getTitle());
    return result;
}
/**
 * Folds the content of {@code other} into this instance, in place.
 *
 * <p>Link providers and publishers are merged as lists; the publication date is taken
 * from {@code other} only when this instance has none; relationship, source and target
 * are merged through their dedicated helpers. The identifier is regenerated at the end.
 *
 * @param other the Scholix whose content is merged into this one
 */
public void mergeFrom(final Scholix other) {
    this.linkprovider = mergeScholixEntityId(this.linkprovider, other.getLinkprovider());
    this.publisher = mergeScholixEntityId(this.publisher, other.getPublisher());

    // Keep an already known publication date; adopt the other one only when ours is empty.
    if (StringUtils.isEmpty(this.publicationDate)) {
        this.publicationDate = other.getPublicationDate();
    }

    this.relationship = mergeRelationships(this.relationship, other.getRelationship());
    this.source = mergeResource(this.source, other.getSource());
    this.target = mergeResource(this.target, other.getTarget());

    // Runs last, after source, target and relationship have been updated.
    this.generateIdentifier();
}
public void generatelinkPublisher() {
Set<String> publisher = new HashSet<>();
if (source.getPublisher() != null)

View File

@ -108,7 +108,7 @@
<arg>-m</arg> <arg>yarn-cluster</arg>
<arg>--workingPath</arg><arg>${workingDirPath}</arg>
</spark>
<ok to="DropAndCreateIndex"/>
<ok to="End"/>
<error to="Kill"/>
</action>