Added stuff

Sandro La Bruzzo 2023-04-12 10:09:05 +02:00
parent 0dc33def41
commit 00ed8d5b31
26 changed files with 1637 additions and 25 deletions

View File

@@ -15,7 +15,8 @@ def extract_argument(path):
         for line in f:
             if not line.startswith("#"):
                 s = line.strip().split("=")
-                arguments[s[0].strip()] = s[1].strip()
+                if len(s) ==2:
+                    arguments[s[0].strip()] = s[1].strip()
     return arguments
@@ -111,8 +112,12 @@ if __name__ == "__main__":
    j_name = ",".join(["sandro_nb/"+ item for item in jars])
    name = main_jar_path.replace("target/", "")
+   jar_section = ""
+   if len(jars) > 0:
+       jar_section =f"--jars {j_name}"
    class_args = extracting_class_args(script_argument)
-   command = f"spark2-submit --master yarn --jars {j_name} --class {script_argument['reference_class']} sandro_nb/{name} {class_args}"
+   command = f"spark2-submit --master yarn {jar_section} --executor-memory 4G --class {script_argument['reference_class']} --conf \"spark.sql.shuffle.partitions=10000\" sandro_nb/{name} {class_args}"
    print(f"executing command {command}")

View File

@@ -135,7 +135,13 @@
        <dependency>
            <groupId>eu.dnetlib.dhp</groupId>
            <artifactId>dhp-schemas</artifactId>
-           <version>3.15.0</version>
+           <version>2.12.1-patched</version>
        </dependency>
+       <!-- JAR NEED -->
+       <dependency>
+           <groupId>eu.dnetlib.dhp</groupId>
+           <artifactId>dhp-common</artifactId>
+           <version>1.2.5-SNAPSHOT</version>
+       </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>

View File

@@ -14,23 +14,7 @@ trait SparkApp extends Serializable {
    * @param args the list of arguments.properties
    */
  def parseArguments(args: Array[String]): mutable.Map[String, String] = {
-    var currentVariable: String = null
-    val argumentMap: mutable.Map[String, String] = mutable.Map()
-    args.zipWithIndex.foreach {
-      case (x, i) =>
-        if (i % 2 == 0) {
-          // ERROR in case the syntax is wrong
-          if (!x.startsWith("-")) throw new IllegalArgumentException("wrong input syntax expected -variable_name value")
-          if (x.startsWith("--"))
-            currentVariable = x.substring(2)
-          else
-            currentVariable = x.substring(1)
-        }
-        else argumentMap += (currentVariable -> x)
-    }
-    argumentMap
+    SparkUtility.parseArguments(args)
  }

  /** Here all the spark applications runs this method

View File

@@ -0,0 +1,27 @@
package com.sandro.app
import scala.collection.mutable
object SparkUtility {
def parseArguments(args: Array[String]): mutable.Map[String, String] = {
var currentVariable: String = null
val argumentMap: mutable.Map[String, String] = mutable.Map()
args.zipWithIndex.foreach {
case (x, i) =>
if (i % 2 == 0) {
// ERROR in case the syntax is wrong
if (!x.startsWith("-")) throw new IllegalArgumentException("wrong input syntax expected -variable_name value")
if (x.startsWith("--"))
currentVariable = x.substring(2)
else
currentVariable = x.substring(1)
}
else argumentMap += (currentVariable -> x)
}
argumentMap
}
}
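
A minimal usage sketch of SparkUtility.parseArguments, assuming it is called from other code on the same classpath; the flag names and values below are invented for illustration.

import com.sandro.app.SparkUtility

object ParseArgumentsExample {
  def main(cliArgs: Array[String]): Unit = {
    // every "-name value" or "--name value" pair becomes one entry in the mutable Map
    val parsed = SparkUtility.parseArguments(Array("--master", "yarn", "-path", "/user/sandro.labruzzo/stores"))
    println(parsed("master")) // yarn
    println(parsed("path"))   // /user/sandro.labruzzo/stores
    // a token in an even position that does not start with "-" throws IllegalArgumentException
  }
}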

View File

@@ -0,0 +1,119 @@
package com.sandro.app.fs;
import com.amazonaws.services.dynamodbv2.xspec.M;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.sandro.app.SparkUtility;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import scala.collection.mutable.Map;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Objects;
public class FsChecks {
public static Configuration getHadoopConfiguration(String nameNode) {
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", nameNode);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
System.setProperty("hadoop.home.dir", "/");
return conf;
}
private static MDStoreInfo extractPath(final String path, final String basePath) {
int res = path.indexOf(basePath);
if (res >0){
String[] split = path.substring(res).split("/");
if (split.length > 2) {
final String ts = split[split.length -1];
final String mdStore = split[split.length -2];
return new MDStoreInfo(mdStore, null, Long.parseLong(ts));
}
}
return null;
}
public static void main(String[] args) throws IOException {
Map<String, String> parsedArgs = SparkUtility.parseArguments(args);
final String namenode = parsedArgs.get("namenode").getOrElse(null);
final String master = parsedArgs.getOrElse("master", null);
final SparkConf conf = new SparkConf();
final SparkSession spark =SparkSession
.builder()
.config(conf)
.master(master)
.appName(FsChecks.class.getSimpleName())
.getOrCreate();
spark.sparkContext().setLogLevel("WARN");
final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(namenode));
final String stores =IOUtils.toString(Objects.requireNonNull(FsChecks.class.getResourceAsStream("/mdstore_info.json")));
final ObjectMapper mapper = new ObjectMapper();
final List<MDStoreInfo> storesINfo =mapper.readValue(stores, new TypeReference<List<MDStoreInfo>>(){});
final String basePath ="/user/sandro.labruzzo/stores/";
Path p = new Path(basePath);
final java.util.Map<String, MDStoreInfo> hdfs_store= new HashMap<>();
final RemoteIterator<LocatedFileStatus> ls = fileSystem.listFiles(p, true);
while (ls.hasNext()){
String current =ls.next().getPath().toString();
final MDStoreInfo info = extractPath(current, basePath);
if (info!= null) {
hdfs_store.put(info.getMdstore(), info);
}
}
storesINfo.stream().filter(s ->s.getLatestTimestamp() != null).forEach( s ->{
if (!hdfs_store.containsKey(s.getMdstore())) {
System.out.printf("Adding mdstore %s\n",s.getMdstore());
try {
fileSystem.mkdirs(new Path(basePath+s.getMdstore()));
fileSystem.create(new Path(basePath+s.getMdstore()+"/"+s.getLatestTimestamp()), true);
System.out.printf("Added path %s/%s/%d\n",basePath, s.getMdstore(),s.getLatestTimestamp());
} catch (IOException e) {
throw new RuntimeException(e);
}
}
else {
final MDStoreInfo current = hdfs_store.get(s.getMdstore());
if (s.getLatestTimestamp() > current.getLatestTimestamp()) {
System.out.println("Updating MDStore "+s.getMdstore());
final String rmPath = String.format("%s%s/%d", basePath, current.getMdstore(), current.getLatestTimestamp());
try {
System.out.println("deleting "+rmPath);
fileSystem.create(new Path(basePath+s.getMdstore()+"/"+s.getLatestTimestamp()), true);
fileSystem.delete(new Path(rmPath), true);
} catch (IOException e) {
throw new RuntimeException("Unable to remove path "+rmPath, e);
}
}
}
});
}
}
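
The directory layout assumed by FsChecks is basePath/mdstoreId/timestamp. A small Scala sketch of the same path-parsing idea follows; the shortened mdstore id is invented for illustration, and extractPath in the class above returns an MDStoreInfo instead of a tuple.

object ExtractPathSketch {
  // returns (mdstoreId, timestamp) when the path contains basePath followed by
  // an mdstore directory and a numeric timestamp leaf, None otherwise
  def extractPath(path: String, basePath: String): Option[(String, Long)] = {
    val idx = path.indexOf(basePath)
    if (idx < 0) None
    else {
      val split = path.substring(idx).split("/")
      if (split.length > 2) Some((split(split.length - 2), split(split.length - 1).toLong))
      else None
    }
  }

  def main(args: Array[String]): Unit = {
    val p = "hdfs://nameservice1/user/sandro.labruzzo/stores/4a0cddf2/1592574025511"
    println(extractPath(p, "/user/sandro.labruzzo/stores/")) // Some((4a0cddf2,1592574025511))
  }
}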

View File

@@ -0,0 +1,53 @@
package com.sandro.app.fs;
public class MDStoreInfo {
private String mdstore;
private String currentId;
private Long latestTimestamp;
public MDStoreInfo() {
}
public MDStoreInfo(String mdstore, String currentId, Long latestTimestamp) {
this.mdstore = mdstore;
this.currentId = currentId;
this.latestTimestamp = latestTimestamp;
}
public String getMdstore() {
return mdstore;
}
public MDStoreInfo setMdstore(String mdstore) {
this.mdstore = mdstore;
return this;
}
public String getCurrentId() {
return currentId;
}
public MDStoreInfo setCurrentId(String currentId) {
this.currentId = currentId;
return this;
}
public Long getLatestTimestamp() {
return latestTimestamp;
}
public MDStoreInfo setLatestTimestamp(Long latestTimestamp) {
this.latestTimestamp = latestTimestamp;
return this;
}
@Override
public String toString() {
return "MDStoreInfo{" +
"mdstore='" + mdstore + '\'' +
", currentId='" + currentId + '\'' +
", latestTimestamp=" + latestTimestamp +
'}';
}
}

View File

@@ -0,0 +1,83 @@
package com.sandro.app.fs
import scala.collection.mutable
import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
case class OAFInfo(datasourcePrefix: String ,cobjCategory: String ,openAccess: List[String], identifierTypes: List[String] , hostedBy: List[String], projectid:String) {}
/** @param xml
*/
class OAFParser(xml: XMLEventReader) {
def extractAttributes(attrs: MetaData, key: String): String = {
val res = attrs.get(key)
if (res.isDefined) {
val s = res.get
if (s != null && s.nonEmpty)
s.head.text
else
null
} else null
}
def extractStats(): OAFInfo = {
var currNode: String = null
var datasourcePrefix: String = null
var cobjCategory: String= null
val openAccess: mutable.Set[String] = mutable.Set()
val identifierTypes: mutable.Set[String] = mutable.Set()
val hostedBy: mutable.Set[String] = mutable.Set()
var projectid:String = null
var node_status:String = null
while (xml.hasNext) {
xml.next match {
case EvElemStart(_, label, attrs, _) =>
currNode = label
label match {
case "datasourceprefix" => node_status = "datasourceprefix"
case "CobjCategory" => node_status = "CobjCategory"
case "accessrights" => node_status = "accessrights"
case "projectid" => node_status = "projectid"
case "hostedBy" =>
val it = extractAttributes(attrs, "name")
if (it != null && it.nonEmpty)
hostedBy += it
case "identifier" =>
val it = extractAttributes(attrs, "identifierType")
if (it != null && it.nonEmpty)
identifierTypes += it
case _ =>
}
case EvElemEnd(_, label) =>
label match {
case "datasourceprefix" => node_status = null
case "CobjCategory" => node_status = null
case "accessrights" => node_status = null
case "hostedBy" => node_status = null
case "projectid" => node_status = null
case _ =>
}
case EvText(text) =>
if (node_status != null && text.trim.nonEmpty)
node_status match {
case "projectid" =>
projectid = text
case "datasourceprefix" =>
datasourcePrefix = text
case "CobjCategory" =>
cobjCategory = text
case "accessrights" =>
openAccess += text
case _ =>
}
case _ =>
}
}
OAFInfo(datasourcePrefix = datasourcePrefix, cobjCategory = cobjCategory, openAccess = openAccess.toList, identifierTypes = identifierTypes.toList, hostedBy = hostedBy.toList,projectid)
}
}
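
A self-contained sketch of driving OAFParser over an inline record; the XML fragment is invented and only contains the elements the parser inspects (datasourceprefix, CobjCategory, accessrights, hostedBy, identifier).

import com.sandro.app.fs.OAFParser
import scala.io.Source
import scala.xml.pull.XMLEventReader

object OAFParserExample {
  def main(args: Array[String]): Unit = {
    // tiny invented record, just enough structure for extractStats()
    val xml =
      """<record>
        |  <datasourceprefix>od______2659</datasourceprefix>
        |  <CobjCategory>0021</CobjCategory>
        |  <accessrights>OPEN</accessrights>
        |  <hostedBy name="ZENODO" id="opendoar____::2659"/>
        |  <identifier identifierType="doi">10.5281/zenodo.582984</identifier>
        |</record>""".stripMargin
    val info = new OAFParser(new XMLEventReader(Source.fromString(xml))).extractStats()
    println(info) // OAFInfo(od______2659,0021,List(OPEN),List(doi),List(ZENODO),null)
  }
}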

View File

@@ -0,0 +1,89 @@
package com.sandro.app.fs;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
public class OafStat implements Serializable {
private String datasourcePrefix;
private String cobjCategory;
private List<String> openAccess;
private List<String> identifierTypes;
private List<String> hostedBy;
public String getDatasourcePrefix() {
return datasourcePrefix;
}
public OafStat setDatasourcePrefix(String datasourcePrefix) {
this.datasourcePrefix = datasourcePrefix;
return this;
}
public String getCobjCategory() {
return cobjCategory;
}
public OafStat setCobjCategory(String cobjCategory) {
this.cobjCategory = cobjCategory;
return this;
}
private void add_value_to_list(final String value, final List<String> l) {
if (value==null || StringUtils.isEmpty(value))
return;
final String normalized_value = value.toLowerCase().trim();
// add the value only when it is not already present (case-insensitive)
if (l.stream().noneMatch(s -> s.equalsIgnoreCase(normalized_value))){
l.add(normalized_value);
}
}
public List<String> getOpenAccess() {
return openAccess;
}
public List<String> getIdentifierTypes() {
return identifierTypes;
}
public List<String> getHostedBy() {
return hostedBy;
}
public void addIdentifierType(final String value) {
if (identifierTypes== null)
identifierTypes = new ArrayList<>();
add_value_to_list(value, identifierTypes);
}
public void addOpenAccess(final String value) {
if (openAccess == null)
openAccess= new ArrayList<>();
add_value_to_list(value, openAccess);
}
public void addHostedBy(final String value) {
if (hostedBy == null)
hostedBy = new ArrayList<>();
add_value_to_list(value, hostedBy);
}
public OafStat setOpenAccess(List<String> openAccess) {
this.openAccess = openAccess;
return this;
}
public OafStat setIdentifierTypes(List<String> identifierTypes) {
this.identifierTypes = identifierTypes;
return this;
}
public OafStat setHostedBy(List<String> hostedBy) {
this.hostedBy = hostedBy;
return this;
}
}

View File

@@ -0,0 +1,48 @@
package eu.dnetlib.doiboost.crossref
import com.sandro.app.AbstractScalaApplication
import org.apache.commons.cli.MissingArgumentException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.count
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods
import org.slf4j.{Logger, LoggerFactory}
class CrossrefStatJob ( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {
def extractTypologies(spark:SparkSession, path:String):Unit = {
import spark.implicits._
val df =spark.read.text(path).as[String].map(s => CrossrefUtils.extractTypeSubtype(s)).distinct()
spark.read.text(path).as[String].map(s => CrossrefUtils.extractTypeSubtype(s)).groupBy("_1", "_2").agg(count("_1").alias("total")).show(200,false)
}
/** Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
val path: String = argumentMap.get("path").orNull
if (path == null) throw new MissingArgumentException("Missing argument path")
extractTypologies(spark, path)
}
}
object CrossrefStatJob {
val log: Logger = LoggerFactory.getLogger(getClass)
def main(args: Array[String]): Unit = {
new CrossrefStatJob(args = args, log = log).initialize().run()
}
}

View File

@@ -0,0 +1,121 @@
package eu.dnetlib.doiboost.crossref
import org.json4s
import org.json4s.JsonAST.JField
import org.json4s.{DefaultFormats, JObject, JString}
import org.json4s.jackson.JsonMethods
case class CrossrefDT(doi: String, json: String, timestamp: Long) {}
object CrossrefUtils {
def extractInfo(input:String):(String,String, String,String,String) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = JsonMethods.parse(input)
val relType = (json \"relationship"\"name").extract[String]
val sourceType = (json \"source"\"objectType").extract[String]
val sourcesubType = (json \"source"\"objectSubType").extract[String]
val targetType = (json \ "target" \ "objectType").extract[String]
val targetsubType = (json \ "target" \ "objectSubType").extract[String]
(sourceType, sourcesubType, relType, targetType, targetsubType)
}
def extractST(input: String): (String, String,String, Boolean) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = JsonMethods.parse(input)
val source = (json \ "source").extractOrElse[String](null)
val target = (json \ "target").extractOrElse[String](null)
val relClass = (json \ "relClass").extractOrElse[String](null)
val dbi = (json \ "dataInfo" \"deletedbyinference").extractOrElse[Boolean](false)
(source, target,relClass, dbi)
}
def extractSourceTargetId(input: String): (String, String) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = JsonMethods.parse(input)
val source = (json \ "source" \ "dnetIdentifier").extractOrElse[String](null)
val target = (json \ "target" \"dnetIdentifier").extractOrElse[String](null)
(source, target)
}
def extractStats(input: String): (String, String, String) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = JsonMethods.parse(input)
val source = (json \ "source"\"objectType").extractOrElse[String](null)
val target = (json \ "target"\"objectType").extractOrElse[String](null)
val relClass = (json \ "relationship"\"name").extractOrElse[String](null)
(source, target, relClass)
}
def extractIdType(input: String): (String, String) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = JsonMethods.parse(input)
val source = (json \ "id" ).extractOrElse[String](null)
val target = (json \ "typology" ).extractOrElse[String](null)
(source, target)
}
// def extractId(input: String): String = {
// implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
// lazy val json: json4s.JValue = JsonMethods.parse(input)
// val source = (json \ "id").extractOrElse[String](null)
//
//
// source
//
// }
def extractTypeSubtype(input:String):(String,String) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = JsonMethods.parse(input)
val objectType = (json \ "type").extractOrElse[String](null)
val objectSubType = (json \ "subtype").extractOrElse[String](null)
(objectType, objectSubType)
}
def extractCF(input: String): List[(String, String)] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = JsonMethods.parse(input)
val id:String = (json \ "id").extract[String]
val l:List[(String, String)] =for {
JObject(cf) <- json\"collectedfrom"
JField("value", JString(cf_name)) <- cf
} yield (cf_name, id)
l
}
def extractId(input:String):String = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = JsonMethods.parse(input)
(json\ "id").extract[String]
}
}
//scholix.joinWith(id, scholix("_2").equalTo(id("_1")), "leftouter").map(s => {
// if (s._2 != null)
// ( s._1.getString(1) , s._2.getString(1) )
// else
// ("publication", s._1.getString(1))
//} ).where("_1 = 'UKN'").write.mode("Overwrite").save("scholix_prod_join")
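
A small sketch of what the json4s extractors above return on hand-written input; both JSON snippets are invented and only contain the fields the extractors read.

import eu.dnetlib.doiboost.crossref.CrossrefUtils

object CrossrefUtilsExample {
  def main(args: Array[String]): Unit = {
    // extractTypeSubtype reads the Crossref "type"/"subtype" fields
    println(CrossrefUtils.extractTypeSubtype("""{"type":"journal-article"}""")) // (journal-article,null)

    // extractCF pairs every collectedfrom.value with the record id
    val oafEntity =
      """{"id":"50|doi_________::abc",
        |"collectedfrom":[{"key":"10|openaire____::123","value":"Crossref"}]}""".stripMargin
    println(CrossrefUtils.extractCF(oafEntity)) // List((Crossref,50|doi_________::abc))
  }
}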

View File

@@ -0,0 +1,108 @@
package eu.dnetlib.doiboost.mag
import scala.collection.JavaConverters._
import eu.dnetlib.dhp.schema.oaf.{KeyValue, Relation}
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods
object MagUtility {
val DOI_PREFIX_REGEX = "(^10\\.|\\/10.)"
val DOI_PREFIX = "10."
case class MagPapers(
PaperId: Long,
Rank: Integer,
Doi: String,
DocType: String,
PaperTitle: String,
OriginalTitle: String,
BookTitle: String,
Year: Option[Integer],
Date: Option[java.sql.Timestamp],
Publisher: String,
JournalId: Option[Long],
ConferenceSeriesId: Option[Long],
ConferenceInstanceId: Option[Long],
Volume: String,
Issue: String,
FirstPage: String,
LastPage: String,
ReferenceCount: Option[Long],
CitationCount: Option[Long],
EstimatedCitation: Option[Long],
OriginalVenue: String,
FamilyId: Option[Long],
CreatedDate: java.sql.Timestamp
) {}
case class MagPaperCitation(
PaperId: Option[Long],
PaperReferenceId: Option[Long],
CitationContext: Option[String]
) {}
def extractST(input: String): List[String] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = JsonMethods.parse(input)
val source = (json \ "source").extract[String]
val target = (json \ "target").extract[String]
List(source, target)
}
def createCiteRealtion(from:String, to:String):List[Relation] = {
val cf = new KeyValue
cf.setValue("Microsoft Academic Graph")
cf.setKey("10|openaire____::" + IdentifierFactory.md5("microsoft"))
val b = new Relation
val t = IdentifierFactory.idFromPid("50", "doi", from, true)
val s = IdentifierFactory.idFromPid("50", "doi", to, true)
b.setSource(s)
b.setTarget(t)
b.setRelType("resultResult")
b.setSubRelType("citation")
b.setRelClass("IsCitedBy")
b.setCollectedfrom(List(cf).asJava)
val a = new Relation
val source = IdentifierFactory.idFromPid("50", "doi", from, true)
val target = IdentifierFactory.idFromPid("50", "doi", to, true)
a.setSource(source)
a.setTarget(target)
a.setRelType("resultResult")
a.setSubRelType("citation")
a.setRelClass("Cites")
a.setCollectedfrom(List(cf).asJava)
List(a,b)
}
def isEmpty(x: String) = x == null || x.trim.isEmpty
def normalizeDoi(input: String): String = {
if (input == null)
return null
val replaced = input
.replaceAll("(?:\\n|\\r|\\t|\\s)", "")
.toLowerCase
.replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX)
if (isEmpty(replaced))
return null
if (replaced.indexOf("10.") < 0)
return null
val ret = replaced.substring(replaced.indexOf("10."))
if (!ret.startsWith(DOI_PREFIX))
return null
return ret
}
}
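
A quick sketch of normalizeDoi on a few made-up inputs: whitespace is stripped, the value is lower-cased, and everything before the first 10. prefix is dropped; inputs without a valid prefix yield null.

import eu.dnetlib.doiboost.mag.MagUtility

object NormalizeDoiExample {
  def main(args: Array[String]): Unit = {
    println(MagUtility.normalizeDoi("https://doi.org/10.1000/XYZ123 ")) // 10.1000/xyz123
    println(MagUtility.normalizeDoi(" 10.5281/ZENODO.582984\n"))        // 10.5281/zenodo.582984
    println(MagUtility.normalizeDoi("not-a-doi"))                       // null
    println(MagUtility.normalizeDoi(null))                              // null
  }
}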

View File

@@ -0,0 +1,61 @@
package eu.dnetlib.doiboost.mag
import com.fasterxml.jackson.databind.ObjectMapper
import com.sandro.app.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.doiboost.mag.MagUtility.{MagPaperCitation, MagPapers, normalizeDoi}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
class SparkMagCitation ( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {
def extractCitationRelationDOI(spark: SparkSession, magBasePath: String, workingDir: String) = {
import spark.implicits._
val papersDs: Dataset[(Long, String)] = spark.read.load(s"$magBasePath/Papers").select("PaperId", "Doi").where(col("PaperId").isNotNull).where(col("Doi").isNotNull).as[(Long, String)]
val citationDS: Dataset[(Long, Long)] = spark.read.load(s"$magBasePath/PaperCitationContexts").select("PaperId", "PaperReferenceId").where(col("PaperId").isNotNull).where(col("PaperReferenceId").isNotNull).as[(Long, Long)]
val DOI_ID = papersDs.map(s => (s._1, normalizeDoi(s._2).toLowerCase.trim)).filter(s => s._2 != null)
citationDS.joinWith(DOI_ID, citationDS("PaperId").equalTo(DOI_ID("_1"))).map(s => (s._2._2, s._1._2)).as[(String, Long)].write.mode(SaveMode.Overwrite).save(s"$workingDir/citation_one_side")
val oneSideRelationDs = spark.read.load(s"$workingDir/citation_one_side").as[(String, Long)]
oneSideRelationDs.joinWith(DOI_ID, oneSideRelationDs("_2").equalTo(DOI_ID("_1")), "inner").map(s => (s._1._1, s._2._2)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDir/citation_mag_doi_doi")
}
def createRelations(spark:SparkSession, workingDir:String) :Unit = {
val mapper = new ObjectMapper()
import spark.implicits._
implicit val resultEncoder:Encoder[Relation] = Encoders.kryo[Relation]
val ctM = spark.read.load(s"$workingDir/citation_mag_doi_doi").as[(String, String)]
ctM.flatMap(t => MagUtility.createCiteRealtion(t._1, t._2)).as[Relation].map(m => mapper.writeValueAsString(m)).write.mode(SaveMode.Overwrite)
.option("compression", "gzip").text(s"$workingDir/relations")
}
def checkRelation(spark:SparkSession, workingDir:String) :Unit = {
import spark.implicits._
spark.read.text(s"$workingDir/relations").as[String].flatMap(s => MagUtility.extractST(s)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDir/distinctID")
}
/** Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
//extractCitationRelationDOI(spark,"/data/doiboost/input/mag/dataset", "/user/sandro.labruzzo/mag")
//createRelations(spark,"/user/sandro.labruzzo/mag")
checkRelation(spark,"/user/sandro.labruzzo/mag")
}
}
object SparkMagCitation {
val log:Logger = LoggerFactory.getLogger(getClass)
def main(args: Array[String]): Unit = {
new SparkMagCitation(args,log).initialize().run()
}
}

View File

@@ -0,0 +1,45 @@
package eu.dnetlib.graph.raw
import com.sandro.app.AbstractScalaApplication
import eu.dnetlib.doiboost.crossref.CrossrefUtils
import org.apache.commons.cli.MissingArgumentException
import org.apache.spark.sql.{Dataset, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import org.apache.spark.sql.functions.{count, desc}
class CheckOpenAireFailure ( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {
def count_collectedFromByEntity(spark:SparkSession,base_path_1:String,base_path_2:String): Unit = {
import spark.implicits._
// val l_types = List("dataset", "datasource","organization","otherresearchproduct","project","publication","software")
println(s"Publication in $base_path_1")
spark.read.text(s"$base_path_1/publication").as[String].flatMap(s => CrossrefUtils.extractCF(s)).groupBy("_1").agg(count("_2").alias("Total")).orderBy(desc("total")).show(100,false)
println(s"Publication in $base_path_2")
spark.read.text(s"$base_path_2/publication").as[String].flatMap(s => CrossrefUtils.extractCF(s)).groupBy("_1").agg(count("_2").alias("Total")).orderBy(desc("total")).show(100, false)
}
/** Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
count_collectedFromByEntity(spark, "/tmp/beta_provision/graph/00_prod_graph_aggregator","/tmp/prod_provision/graph/00_graph_aggregator" )
}
}
object CheckOpenAireFailure {
val log = LoggerFactory.getLogger(getClass)
def main(args: Array[String]): Unit = {
new CheckOpenAireFailure(args, log).initialize().run();
}
}

View File

@@ -0,0 +1,25 @@
package eu.dnetlib.graph.raw;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
public class CheckPath {
public static void main(String[] args) {
final SparkConf conf= new SparkConf();
final SparkSession spark = SparkSession
.builder()
.config(conf)
.appName(CheckPath.class.getSimpleName())
.master("yarn")
.getOrCreate();
final String sp ="/data/aggregator_contents/PROD_for_BETA/mdstore/*/*";
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final Long total = sc.sequenceFile(sp, Text.class, Text.class).count();
System.out.println("total = " + total);
}
}

View File

@@ -0,0 +1,86 @@
package eu.dnetlib.graph.raw
import com.sandro.app.AbstractScalaApplication
import com.sandro.app.fs.{OAFInfo, OafStat}
import eu.dnetlib.scholix.{DHPUtils, Measurement}
import org.apache.commons.cli.MissingArgumentException
import org.apache.hadoop.io.Text
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.slf4j.{Logger, LoggerFactory}
class GenerateMDStoreStats ( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {
def hasHostedBy(o: List[String]): Int = {
if (o == null || o.isEmpty)
0
else
o.count(h => !h.equalsIgnoreCase("unknown repository"))
}
def generateMDStoreStats(spark:SparkSession,statsDBPath:String, ts:Long, targetPath:String) :Unit = {
import spark.implicits._
val df:Dataset[OAFInfo] = spark.read.load(statsDBPath).as[OAFInfo]
log.error("Generating Total Item measurement")
df.groupBy("datasourcePrefix").agg(count("datasourcePrefix").as("Total"))
.map(r => Measurement(name = "Total Item", nsprefix = r.getAs[String]("datasourcePrefix"), timestamp = ts, value = r.getAs[Long]("Total")))
.write.mode(SaveMode.Overwrite).save(targetPath)
log.error("Generating Total Item related to a project measurement")
df.groupBy("datasourcePrefix").agg(count("projectid").alias("ItemWithProject"))
.map(r => {
Measurement(name = "Project Relation count", nsprefix = r.getAs[String]("datasourcePrefix"), timestamp = ts, value = r.getAs[Long]("ItemWithProject"))
}).write.mode(SaveMode.Append).save(targetPath)
df.map(o => (o.datasourcePrefix, hasHostedBy(o.hostedBy)))
.groupBy("_1")
.agg(sum("_2"))
.map(r =>
Measurement("Hosted By Record count", nsprefix = r.getString(0), timestamp = ts, value =r.getAs[Long](1) )
).write.mode(SaveMode.Append).save(targetPath)
}
def generateInfoOaf(spark:SparkSession, basePath:String, statsDBPath:String):Unit = {
val sc = spark.sparkContext
import spark.implicits._
println(s"base Path is $basePath")
val mdstores :RDD[OAFInfo] = sc.sequenceFile(basePath, classOf[Text],classOf[Text]).map(x=>x._2.toString).map(x=> DHPUtils.convertTOOAFStat(x))
val df:Dataset[OAFInfo] =spark.createDataset(mdstores)
df.write.mode(SaveMode.Overwrite).save(statsDBPath)
}
/** Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
val path:String = argumentMap.get("path").orNull
if (path == null) throw new MissingArgumentException("Missing argument path")
println(s"base Path is $path")
generateInfoOaf(spark, path, "/user/sandro.labruzzo/prod_for_beta_stats")
generateMDStoreStats(spark, "/user/sandro.labruzzo/prod_for_beta_stats", System.currentTimeMillis(),"/user/sandro.labruzzo/prod_for_beta_mesaurement")
spark.close()
}
}
object GenerateMDStoreStats{
val log: Logger = LoggerFactory.getLogger(GenerateMDStoreStats.getClass)
def main(args: Array[String]): Unit = {
new GenerateMDStoreStats(args,log ).initialize().run()
}
}

View File

@@ -0,0 +1,59 @@
package eu.dnetlib.scholix
import com.sandro.app.AbstractScalaApplication
import org.apache.spark.sql.functions.{count, desc}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
class CheckEBIStats( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {
def extractPidSchema(input:String) :String = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val source = (json \ "source").extractOrElse[String](null)
if (source != null) {
null
}
else {
val l: List[String] = for {
JObject(pids) <- json \\ "pid"
JField("qualifier", JObject(qualifier)) <- pids
JField("classid", JString(classid)) <- qualifier
} yield classid
l.head
}
}
def listPidType(spark:SparkSession, path:String) :Unit = {
import spark.implicits._
val df:Dataset[String] = spark.read.text(path).as[String]
df.map(extractPidSchema).filter(s=> s!=null).groupBy("value").agg(count("value").alias("Total")).orderBy(desc("Total")).show(300, false)
}
override def run(): Unit = {
val path = argumentMap("path")
log.warn(s"path is $path")
listPidType(spark, path)
}
}
object CheckEBIStats {
val log: Logger = LoggerFactory.getLogger(getClass.getName)
def main(args: Array[String]): Unit = {
new CheckEBIStats(args,log).initialize().run()
}
}

View File

@@ -2,11 +2,10 @@ package eu.dnetlib.scholix
import com.sandro.app.AbstractScalaApplication
import org.apache.spark.sql.SparkSession
import org.slf4j.{Logger, LoggerFactory}
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import org.apache.spark.sql.functions.{count,desc}
import org.slf4j.{Logger, LoggerFactory}
class CheckMDStoreContent( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {
@@ -20,7 +19,7 @@ class CheckMDStoreContent( args: Array[String], log: Logger) extends AbstractSc
    val source = (json \ "source").extractOrElse[String](null)
    if (source != null) {
      val rel =(json \"relClass").extract[String]
-     s"Relation:$rel"
+     s"Relation"
    }
    else {
      val l: List[String] = for {
@@ -32,13 +31,55 @@ class CheckMDStoreContent( args: Array[String], log: Logger) extends AbstractSc
}
}
def filter_relationId(input:String):List[String] = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val source = (json \ "source").extractOrElse[String](null)
if (source != null) {
val target =(json \"target").extract[String]
List(source, target)
} else
List()
}
def filter_entity_id(input:String):(String, String) = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val result_type = (json \ "resulttype" \ "classid").extractOrElse[String](null)
val id = (json \ "id").extractOrElse[String](null)
if (id == null)
null
else
(id,result_type)
}
def show_typologies(spark:SparkSession, path:String): Unit = {
import spark.implicits._
val df = spark.read.text(path).as[String]
df.map(s =>get_type(s)).groupBy("value").agg(count("value").alias("Total")).orderBy(desc("Total")).show(300, false)
val id_rels = df.flatMap(s => filter_relationId(s))
.filter(s=>s.startsWith("unresolved::") && s.contains("pmid"))
.distinct()
log.warn(s"Total pubmed pubs imported in scholexplorer ${id_rels.count}")
// df.map(s =>filter_entity_id(s))
// .filter(s =>s!=null)
// .map(_._2)
// .groupBy("value").agg(count("value").alias("Total")).orderBy(desc("Total")).show(300, false)
// val id_datacite = df.map(s =>filter_entity_id(s))
// .filter(s =>s!=null)
// .filter(s => "publication".equalsIgnoreCase(s._2))
// .map(_._1)
// .distinct()
//
// val total_pubs = id_datacite.joinWith(id_rels, id_datacite("value").equalTo(id_rels("value")), "inner").count()
//
// log.warn(s"total doi rel in datacite : $total_pubs")
}

View File

@@ -60,7 +60,7 @@ class CheckRelation( args: Array[String], log: Logger) extends AbstractScalaAppl
      .filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
      .filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
      .filter(r => filterRelations(r))
-     .map(r => r.getSubRelType).as[String].groupBy("value")
+     .map(r => r.getRelClass).as[String].groupBy("value")
      .agg(count("value").alias("Total"))
      .orderBy(desc("Total"))
      .show(300, truncate = false)

View File

@@ -0,0 +1,96 @@
package eu.dnetlib.scholix
import com.fasterxml.jackson.databind.ObjectMapper
import com.sandro.app.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.scholix.CheckRelation.logger
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
class CheckSummaries ( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {
def filterRelations(r: Relation): Boolean = {
val relClassFilter = List(
"merges",
"isMergedIn",
"HasAmongTopNSimilarDocuments",
"IsAmongTopNSimilarDocuments"
)
if (relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
false
else {
if (r.getCollectedfrom == null || r.getCollectedfrom.size() == 0)
false
else if (r.getCollectedfrom.size() > 1)
true
else if (r.getCollectedfrom.size() == 1 && r.getCollectedfrom.get(0)!=null && "OpenCitations".equalsIgnoreCase(r.getCollectedfrom.get(0).getValue))
false
else
true
}
}
def extractSourceTarget(input:String, path:String) :String = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
(json \ path).extract[String]
}
def countSummaries(basePath:String, spark:SparkSession) :Unit = {
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
implicit val oafEncoder: Encoder[OafEntity] = Encoders.kryo[OafEntity]
import spark.implicits._
val relPath = s"/tmp/beta_provision/scholix/relation"
val pubPath = s"$basePath/entities/publication"
// val ds:Dataset[ScholixSummary] = spark.read.load(path).as[ScholixSummary]
//
//
// ds.map(s => s.getTypology.toString).groupBy("value").agg(count("value").alias("Total")).show(300, truncate = false)
val mapper = new ObjectMapper()
val df =spark.read.load(relPath).as[Relation]
val totalIDS = df.flatMap(r=> List(r.getSource,r.getTarget))
.filter(s => s.startsWith("50"))
.distinct()
val pubId = spark.read.load(pubPath).as[OafEntity].map(o =>o.getId).distinct()
val idPubsTotal = pubId.joinWith(totalIDS, pubId("value").equalTo(totalIDS("value")), "inner").count()
log.warn(s"Total ids in input Relation of type publication $idPubsTotal")
}
/** Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
val path = argumentMap("path")
logger.warn(s"path properties is $path")
if (path == null || path.isEmpty)
throw new IllegalArgumentException("missing path arguments.properties -path when launch file, check if it is inside the arguments.properties")
countSummaries(path, spark)
}
}
object CheckSummaries {
val logger: Logger = LoggerFactory.getLogger(CheckRelation.getClass.getName)
def main(args: Array[String]): Unit = {
new CheckSummaries(args,logger).initialize().run()
}
}

View File

@@ -0,0 +1,139 @@
package eu.dnetlib.scholix
import com.sandro.app.fs.{OAFInfo, OAFParser}
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue, Relation, Result}
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils.DATE_RELATION_KEY
import scala.io.Source
import scala.xml.pull.XMLEventReader
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import collection.JavaConverters._
case class Measurement(name:String, nsprefix:String, timestamp:Long, value:Long) {}
object DHPUtils {
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(
false,
null,
false,
false,
ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
"0.9"
)
val relations = Map(
"IsSupplementTo"->"IsSupplementedBy",
"IsSupplementedBy"->"IsSupplementTo",
"References"->"IsReferencedBy",
"IsReferencedBy"->"References",
"IsRelatedTo"->"IsRelatedTo" )
val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
def createInverseRelationships(r:Relation): List[Relation] = {
val inverse = new Relation()
inverse.setDataInfo(r.getDataInfo)
inverse.setCollectedfrom(r.getCollectedfrom)
inverse.setProperties(r.getProperties)
inverse.setSource(r.getTarget)
inverse.setTarget(r.getSource)
inverse.setRelType(r.getRelType)
inverse.setSubRelType(r.getSubRelType)
inverse.setRelClass(relations.getOrElse(r.getRelClass, r.getRelClass))
List(r, inverse)
}
def extractPidMap(r:Result):List[(String, String)] = {
if (r == null || r.getInstance()==null)
return null
r.getInstance().asScala.filter(i => i.getPid!= null).flatMap(i =>i.getPid.asScala).map(p => (r.getId, generate_unresolved_id(p.getValue, p.getQualifier.getClassid))).toList
}
def extractIdRel(input:String):String = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val relName: String = (json \ "RelationshipType" \ "Name").extract[String]
val sourcePid = (json \ "Source" \ "Identifier" \ "ID").extract[String]
val sourcePidType = (json \ "Source" \ "Identifier" \ "IDScheme").extract[String]
val targetPid = (json \ "Target" \ "Identifier" \ "ID").extract[String]
val targetPidType = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String]
s"$sourcePid::$sourcePidType::$relName::$targetPid::$targetPidType".toLowerCase()
}
def eventDataToRelation(input:String):Relation = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val relName:String = (json \ "RelationshipType" \ "Name").extract[String]
val sourcePid = (json\ "Source" \ "Identifier"\ "ID").extract[String]
val sourcePidType = (json\ "Source" \ "Identifier"\ "IDScheme").extract[String]
val targetPid = (json \ "Target" \ "Identifier" \ "ID").extract[String]
val targetPidType = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String]
val date:String = (json\"LinkPublicationDate").extract[String]
createRelation(generate_unresolved_id(sourcePid, sourcePidType),generate_unresolved_id(targetPid, targetPidType), ElsevierCollectedFrom, "relationship",relName, date)
}
def createRelation(
sourceId: String,
targetId:String,
collectedFrom: KeyValue,
subRelType: String,
relClass: String,
date: String
): Relation = {
val rel = new Relation
rel.setCollectedfrom(List(ElsevierCollectedFrom).asJava)
rel.setDataInfo(DATA_INFO)
rel.setRelType(ModelConstants.RESULT_RESULT)
rel.setSubRelType(subRelType)
rel.setRelClass(relClass)
rel.setSource(sourceId)
rel.setTarget(targetId)
val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
rel.setCollectedfrom(List(collectedFrom).asJava)
rel
}
def generate_unresolved_id(pid: String, pidType: String): String = {
s"unresolved::${pid.toLowerCase()}::${pidType.toLowerCase()}"
}
def convertTOOAFStat(input: String): OAFInfo = {
val xml = new XMLEventReader(Source.fromString(input))
val parser = new OAFParser(xml)
parser.extractStats()
}
}
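
A short sketch of how the helpers above compose: build an unresolved relation between two PIDs and derive its inverse. The PIDs, date, and relation class are invented for illustration.

import eu.dnetlib.scholix.DHPUtils

object DHPUtilsExample {
  def main(args: Array[String]): Unit = {
    val source = DHPUtils.generate_unresolved_id("10.1000/xyz123", "doi") // unresolved::10.1000/xyz123::doi
    val target = DHPUtils.generate_unresolved_id("PMC1234567", "pmc")     // unresolved::pmc1234567::pmc

    val rel = DHPUtils.createRelation(
      sourceId = source,
      targetId = target,
      collectedFrom = DHPUtils.ElsevierCollectedFrom,
      subRelType = "relationship",
      relClass = "References",
      date = "2023-04-12"
    )

    // the original relation plus its inverse, with relClass flipped through the relations map
    DHPUtils.createInverseRelationships(rel)
      .foreach(r => println(s"${r.getSource} ${r.getRelClass} ${r.getTarget}"))
  }
}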

View File

@@ -0,0 +1,236 @@
package eu.dnetlib.scholix
import com.fasterxml.jackson.databind.ObjectMapper
import com.sandro.app.AbstractScalaApplication
import eu.dnetlib.dhp.schema.oaf.{Relation, Result}
import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import eu.dnetlib.doiboost.crossref.CrossrefDT
import org.apache.commons.cli.MissingArgumentException
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import org.apache.spark.sql.functions.count
class GenerateEventDataRelations ( args: Array[String], log: Logger) extends AbstractScalaApplication( args: Array[String], log: Logger) {
private def convertEventDataToRelations(spark:SparkSession, sourcePath:String, relation_path:String): Unit = {
implicit val relEncoders:Encoder[Relation] = Encoders.kryo[Relation]
import spark.implicits._
spark.read.load(sourcePath)
.select("json")
.map(r => r.getString(0))
.map(r=>DHPUtils.eventDataToRelation(r))
.write.mode(SaveMode.Overwrite).save(relation_path)
}
private def resolveRelations(spark:SparkSession, workingPath:String): Unit = {
val entityPath ="/tmp/beta_provision/scholix/entities/*"
implicit val resultEncoder:Encoder[Result]= Encoders.kryo[Result]
implicit val relEncoders:Encoder[Relation] = Encoders.kryo[Relation]
import spark.implicits._
val df =spark.read.load(entityPath).as[Result]
df.filter(r =>r != null && r.getDataInfo!= null && false == r.getDataInfo.getDeletedbyinference)
.flatMap(r =>DHPUtils.extractPidMap(r))
.write.mode(SaveMode.Overwrite)
.save(s"$workingPath/pidMap")
val pidMap: Dataset[(String, String)] = spark.read.load(s"$workingPath/pidMap").as[(String, String)]
val unresolvedSourceRelation: Dataset[(String,Relation)] = spark.read.load(s"$workingPath/relations").as[Relation].map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING,relEncoders))
unresolvedSourceRelation.joinWith(pidMap, unresolvedSourceRelation("_1").equalTo(pidMap("_2")), "leftouter")
.map(k =>{
if (k._2 == null)
null
else {
val rel = k._1._2
val pid = k._2._1
rel.setSource(pid)
rel
}
}).as[Relation].filter(r => !r.getSource.startsWith("unresolved")).write.mode(SaveMode.Overwrite).save(s"$workingPath/relSourceResolved")
val rsolved: Dataset[(String,Relation)] =spark.read.load(s"$workingPath/relSourceResolved").as[Relation].map(r => (r.getTarget, r))(Encoders.tuple(Encoders.STRING,relEncoders))
rsolved.joinWith(pidMap, rsolved("_1").equalTo(pidMap("_2")), "leftouter")
.map(k => {
if (k._2 == null)
null
else {
val rel = k._1._2
val pid = k._2._1
rel.setTarget(pid)
rel
}
}).as[Relation].filter(r => !r.getTarget.startsWith("unresolved")).flatMap(r=>DHPUtils.createInverseRelationships(r)).write.mode(SaveMode.Overwrite).save(s"$workingPath/relResolved")
val totRels = unresolvedSourceRelation.count()
val resolved = spark.read.load(s"$workingPath/relResolved").as[Relation]
val totResolved = resolved.count()
println(s"RESOLVED $totResolved/$totRels")
}
private def serializeScholix(spark:SparkSession, workingPath:String):Unit = {
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
val scholix = spark.read.load(s"$workingPath/scholix").as[Scholix]
val mapper = new ObjectMapper()
import spark.implicits._
scholix.map(s => mapper.writeValueAsString(s)).write.mode(SaveMode.Overwrite).text(s"$workingPath/scholix_json")
}
private def checkRelations(spark:SparkSession, workingPath:String, rel_path:String): Unit = {
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
import spark.implicits._
val rels = spark.read.load(rel_path)
val scholix = spark.read.load(s"$workingPath/scholix").as[Scholix]
println(scholix.count())
scholix.map(s => s.getRelationship.getName).groupBy("value").agg(count("value").alias("Total")).show()
println(rels.count())
println( rels.select("json").map(r=>DHPUtils.extractIdRel(r.getString(0))).distinct().count())
}
private def createScholix(spark:SparkSession, workingPath:String, summaryPath:String):Unit = {
implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary]
implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
implicit val relEncoders: Encoder[Relation] = Encoders.kryo[Relation]
import spark.implicits._
val summaryDS =spark.read.load("/tmp/beta_provision/scholix/provision/summaries").as[ScholixSummary]
.map(s => (s.getId,s))(Encoders.tuple(Encoders.STRING, summaryEncoder))
val relationDS = spark.read.load(s"$workingPath/relResolved").as[Relation].map(r => (r.getSource,r))(Encoders.tuple(Encoders.STRING, relEncoders))
relationDS
.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left")
.map { input: ((String, Relation), (String, ScholixSummary)) =>
if (input._1 != null && input._2 != null) {
val rel: Relation = input._1._2
val source: ScholixSummary = input._2._2
(rel.getTarget, ScholixUtils.scholixFromSource(rel, source))
} else null
}(Encoders.tuple(Encoders.STRING, scholixEncoder))
.filter(r => r != null)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/scholix_from_source")
val scholixSource: Dataset[(String, Scholix)] = spark.read
.load(s"$workingPath/scholix_from_source")
.as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder))
scholixSource
.joinWith(summaryDS, scholixSource("_1").equalTo(summaryDS("_1")), "left")
.map { input: ((String, Scholix), (String, ScholixSummary)) =>
if (input._2 == null) {
null
} else {
val s: Scholix = input._1._2
val target: ScholixSummary = input._2._2
ScholixUtils.generateCompleteScholix(s, target)
}
}
.filter(s => s != null)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/scholix_one_verse")
val scholix_o_v: Dataset[Scholix] =
spark.read.load(s"$workingPath/scholix_one_verse").as[Scholix]
scholix_o_v
.flatMap(s => List(s, ScholixUtils.createInverseScholixRelation(s)))
.as[Scholix]
.map(s => (s.getIdentifier, s))(Encoders.tuple(Encoders.STRING, scholixEncoder))
.groupByKey(_._1)
.agg(ScholixUtils.scholixAggregator.toColumn)
.map(s => s._2)
.write
.mode(SaveMode.Overwrite)
.save(s"$workingPath/scholix")
val scholix_final: Dataset[Scholix] = spark.read.load(s"$workingPath/scholix").as[Scholix]
println(scholix_final.count())
}
private def checkCrossrefDOI(spark:SparkSession):Unit ={
implicit val mrEncoder: Encoder[CrossrefDT] = Encoders.kryo[CrossrefDT]
import spark.implicits._
// val df = spark.read.load("/data/doiboost/input/crossref/crossref_ds").as[CrossrefDT]
// val tot = df.filter(d=> d.doi.equalsIgnoreCase("10.1107/s2052252521010563/yc50352sup3.hkl")).count()
// println(s"Found $tot")
val tot2 = spark.read.text("/tmp/beta_provision/graph/19_graph_blacklisted/*").as[String].filter(s=> s.contains("10.1107/s2052252521010563/yc50352sup3.hkl")).count()
println(s"Found in the final graph $tot2")
}
/** Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
val source_path: String = argumentMap.get("source_path").orNull
val working_path : String = argumentMap.get("working_path").orNull
if (source_path == null) throw new MissingArgumentException("Missing argument path")
if (working_path == null) throw new MissingArgumentException("Missing argument path")
// convertEventDataToRelations(spark, source_path, s"$working_path/relations")
// resolveRelations(spark, working_path )
// checkRelations(spark, working_path, source_path)
//createScholix(spark, working_path, "/tmp/beta_provision/scholix/provision/summaries")
// serializeScholix(spark, working_path)
checkCrossrefDOI(spark)
spark.close()
}
}
object GenerateEventDataRelations {
val log: Logger = LoggerFactory.getLogger(getClass.getName)
def main(args: Array[String]): Unit = {
new GenerateEventDataRelations(args,log).initialize().run()
}
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,40 @@
package com.sandro.app;
import com.sandro.app.fs.MDStoreInfo;
import org.junit.jupiter.api.Test;
public class FSCheckTest {
private MDStoreInfo extractPath(final String path, final String basePath) {
int res = path.indexOf(basePath);
if (res >0){
String[] split = path.substring(res).split("/");
if (split.length > 2) {
final String ts = split[split.length -1];
final String mdStore = split[split.length -2];
return new MDStoreInfo(mdStore, null, Long.parseLong(ts));
}
}
return null;
}
@Test
public void doTest() {
final String basePath = "/user/sandro.labruzzo/stores/";
final String path = "hdfs://nameservice1/user/sandro.labruzzo/stores/4a0cddf2-20e9-4558-a3c1-4d20cfecffa8_TURTdG9yZURTUmVzb3VyY2VzL01EU3RvcmVEU1Jlc291cmNlVHlwZQ==/1592574025511";
System.out.println(extractPath(path,basePath));
}
}

View File

@@ -0,0 +1,23 @@
package com.sandro.app
import com.sandro.app.fs.OAFParser
import org.junit.jupiter.api.Test
import scala.io.Source
import scala.xml.pull.XMLEventReader
class xmlParser {
@Test
def testParse(): Unit = {
val xml =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/store/odf.xml")).mkString
val xml_e = new XMLEventReader(Source.fromString(xml))
val parser = new OAFParser(xml_e)
println(parser.extractStats())
}
}

View File

@@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:oaf="http://namespace.openaire.eu/oaf">
<header xmlns="http://namespace.openaire.eu/">
<dri:objIdentifier>rep______umk::000b9ae2fe88cd81444bdc4b56cd39fe</dri:objIdentifier>
<dri:recordIdentifier>002e31dc-3a44-4a56-a02c-5f60cfa13c9e_TURTdG9yZURTUmVzb3VyY2VzL01EU3RvcmVEU1Jlc291cmNlVHlwZQ==::oai:repozytorium.umk.pl:item/1440</dri:recordIdentifier>
<dri:dateOfCollection/>
<dri:mdFormat/>
<dri:mdFormatInterpretation/>
<dri:repositoryId>7f1d17f1-4d1d-4df5-bc55-003f203ac6b5_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId>
<dr:objectIdentifier/>
<dr:dateOfCollection>2014-07-17T10:09:47Z</dr:dateOfCollection>
<oaf:datasourceprefix>rep______umk</oaf:datasourceprefix>
</header>
<metadata xmlns="http://namespace.openaire.eu/">
<dc:creator>Perlik, Kamila</dc:creator>
<dc:dateAccepted>2011-12-08</dc:dateAccepted>
<dc:description>Artykuł relacjonuje przebieg odbywającej się na początku kwietnia 2011 r. piątej konferencji naukowej Instytutu Informacji Naukowej i Studiów Bibliologicznych Uniwersytetu Warszawskiego. Celem spotkania była diagnoza zmian zachodzących w nauce o informacji i jej polu badawczym oraz w praktyce działalności informacyjnej, wyodrębnienie najważniejszych współczesnych kierunków rozwoju w obu tych płaszczyznach oraz ocena zakresu ich reprezentacji w edukacji specjalistów informacji. Główne obszary dyskusji objęły m.in. teoretyczne podstawy nauki o informacji, metodologię badań w nauce o informacji, zarządzanie informacją i wiedzą, badania użytkowników i użytkowania informacji, społeczną recepcję technologii informacyjnych i rolę ICT w życiu społecznym, prawne i etyczne aspekty działalności informacyjnej, nowe role bibliotekarzy, kształcenie specjalistów informacji, ilościowe badania piśmiennictwa naukowego (webometria, infometria, bibliometria) oraz kwestie tworzenia bibliotek cyfrowych, repozytoriów i elektronicznego publikowania.</dc:description>
<dc:identifier>http://repozytorium.umk.pl/handle/item/1440</dc:identifier>
<dc:language>pol</dc:language>
<dc:title>5. Konferencja Naukowa Instytutu Informacji Naukowej i Studiów Bibliologicznych Uniwersytetu Warszawskiego „Nauka o informacji (informacja naukowa) w okresie zmian” (Warszawa, 4–5 kwietnia 2011 r.)</dc:title>
<dc:subject>biblioteki cyfrowe</dc:subject>
<dc:subject>repozytoria</dc:subject>
<dc:subject>metodologia bedań</dc:subject>
<dc:subject>prawo</dc:subject>
<dc:subject>etyka</dc:subject>
<dr:CobjCategory>0001</dr:CobjCategory>
<dr:CobjIdentifier>Toruńskie Studia Bibliologiczne, No. 2 (7), Vol. 4, pp. 165-169</dr:CobjIdentifier>
<dr:CobjIdentifier>2080-1807</dr:CobjIdentifier>
<dr:CobjIdentifier>doi:10.12775/TSB.2011.026</dr:CobjIdentifier>
<oaf:collectedDatasourceid>driver______::d6a8dc01-db12-48d7-be88-9919f1c912c6</oaf:collectedDatasourceid>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:hostedBy name="Repozytorium Uniwersytetu Mikołaja Kopernika" id="driver______::d6a8dc01-db12-48d7-be88-9919f1c912c6"/>
<oaf:hostedBy name="Repozytorium Uniwersytetu Mikołaja Kopernika" id="driver______::d6a8dc01-db12-48d7-be88-9919f1c912c6"/>
<oaf:identifier identifierType="doi">10.12775/TSB.2011.026</oaf:identifier>
</metadata>
</record>

View File

@@ -0,0 +1,80 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<oai:header xmlns="http://namespace.openaire.eu/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<dri:objIdentifier>od______2659::0000170f23c42f3712d9892367b217fe</dri:objIdentifier>
<dri:recordIdentifier>oai:zenodo.org:582984</dri:recordIdentifier>
<dri:dateOfCollection>2020-01-26T00:05:19.414Z</dri:dateOfCollection>
<oaf:datasourceprefix>od______2659</oaf:datasourceprefix>
<identifier xmlns="http://www.openarchives.org/OAI/2.0/">oai:zenodo.org:582984</identifier>
<datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2020-01-21T07:23:17Z</datestamp>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">openaire_data</setSpec>
<setSpec xmlns="http://www.openarchives.org/OAI/2.0/">user-powertac</setSpec>
</oai:header>
<metadata>
<resource xmlns="http://datacite.org/schema/kernel-3"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<identifier identifierType="DOI">10.5281/zenodo.582984</identifier>
<creators>
<creator>
<creatorName>PowerTAC</creatorName>
</creator>
</creators>
<titles>
<title>PowerTAC 2016-06 Finals Game 64</title>
</titles>
<publisher>Zenodo</publisher>
<publicationYear>2016</publicationYear>
<subjects>
<subject>PowerTAC</subject>
</subjects>
<dates>
<date dateType="Issued">2016-06-07</date>
</dates>
<resourceType resourceTypeGeneral="Dataset"/>
<relatedIdentifiers>
<relatedIdentifier relatedIdentifierType="URL" relationType="IsPartOf">https://zenodo.org/communities/powertac</relatedIdentifier>
</relatedIdentifiers>
<rightsList>
<rights rightsURI="http://creativecommons.org/licenses/by/4.0/legalcode">Creative Commons Attribution 4.0 International</rights>
<rights rightsURI="info:eu-repo/semantics/openAccess">Open Access</rights>
</rightsList>
<descriptions>
<description descriptionType="Abstract">Log and boot files of game 64</description>
</descriptions>
</resource>
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
<oaf:dateAccepted>2016-01-01</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:language>und</oaf:language>
<oaf:concept id="https://zenodo.org/communities/powertac"/>
<oaf:hostedBy id="opendoar____::2659" name="ZENODO"/>
<oaf:collectedFrom id="opendoar____::2659" name="ZENODO"/>
</metadata>
<about xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2020-01-26T00:05:19.414Z">
<baseURL>https%3A%2F%2Fzenodo.org%2Foai2d</baseURL>
<identifier>oai:zenodo.org:582984</identifier>
<datestamp>2020-01-21T07:23:17Z</datestamp>
<metadataNamespace/>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
classname="sysimport:crosswalk:repository"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>