From 264723ffd80d13236207fb31a7661145c3c464ed Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 23 Nov 2020 14:41:38 +0100 Subject: [PATCH] updated stuff for zenodo upload --- .../eu/dnetlib/dhp/doiboost/QueryTest.scala | 37 +++-- .../eu/dnetlib/dhp/export/zenodo/MakeTar.java | 127 +++++++++--------- .../dhp/export/zenodo/SendToZenodoHDFS.java | 2 +- .../dnetlib/sx/zenodo/oozie_app/workflow.xml | 36 ++--- 4 files changed, 107 insertions(+), 95 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala index 61c1f5111..698c8cc79 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/dhp/doiboost/QueryTest.scala @@ -42,22 +42,33 @@ class QueryTest { } - def myQuery(spark:SparkSession, sc:SparkContext): Unit = { - implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] - - - val mapper = new ObjectMapper() - mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) - - - val ds:Dataset[Publication] = spark.read.load("/tmp/p").as[Publication] - - - - ds.filter(p =>p.getBestaccessright!= null && p.getBestaccessright.getClassname.nonEmpty).count() + def extractId(input:String):String = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: json4s.JValue = parse(input) + (json \ "id").extractOrElse[String](null) } + + def myQuery(spark:SparkSession, sc:SparkContext): Unit = { + implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] + val mapper = new ObjectMapper() + mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) + + val ds:Dataset[Publication] = spark.read.load("/tmp/p").as[Publication] + + + val sc = spark.sparkContext + + + ds.filter(p =>p.getBestaccessright!= null && p.getBestaccessright.getClassname.nonEmpty).count() + val typologies =List("dataset","datasource","organization","otherresearchproduct","project","publication","software") + val basePath ="/tt" + + typologies.map(tp => sc.textFile(s"$basePath/dataset").map(s =>extractId(tp) ).distinct.count()).sum() + + } + } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java index 95bea74a2..e19432f29 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/MakeTar.java @@ -12,6 +12,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.MakeTarArchive; public class MakeTar implements Serializable { @@ -41,71 +42,71 @@ public class MakeTar implements Serializable { FileSystem fileSystem = FileSystem.get(conf); - makeTArArchive(fileSystem, inputPath, outputPath); + MakeTarArchive.tarMaxSize(fileSystem, inputPath, outputPath, "scholix_dump", 25); } - public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException { - - RemoteIterator dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath)); - - while (dir_iterator.hasNext()) { - LocatedFileStatus fileStatus = dir_iterator.next(); - - Path p = fileStatus.getPath(); - String p_string = p.toString(); - String entity = p_string.substring(p_string.lastIndexOf("/") + 1); - - write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity); - } - - } - - private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name) - throws IOException { - - Path hdfsWritePath = new Path(outputPath); - FSDataOutputStream fsDataOutputStream = null; - if (fileSystem.exists(hdfsWritePath)) { - fileSystem.delete(hdfsWritePath, true); - - } - fsDataOutputStream = fileSystem.create(hdfsWritePath); - - TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream()); - - RemoteIterator fileStatusListIterator = fileSystem - .listFiles( - new Path(inputPath), true); - - while (fileStatusListIterator.hasNext()) { - LocatedFileStatus fileStatus = fileStatusListIterator.next(); - - Path p = fileStatus.getPath(); - String p_string = p.toString(); - if (!p_string.endsWith("_SUCCESS")) { - String name = p_string.substring(p_string.lastIndexOf("/") + 1); - TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz"); - entry.setSize(fileStatus.getLen()); - ar.putArchiveEntry(entry); - - InputStream is = fileSystem.open(fileStatus.getPath()); - - BufferedInputStream bis = new BufferedInputStream(is); - - int count; - byte data[] = new byte[1024]; - while ((count = bis.read(data, 0, data.length)) != -1) { - ar.write(data, 0, count); - } - bis.close(); - ar.closeArchiveEntry(); - - } - - } - - ar.close(); - } +// public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException { +// +// RemoteIterator dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath)); +// +// while (dir_iterator.hasNext()) { +// LocatedFileStatus fileStatus = dir_iterator.next(); +// +// Path p = fileStatus.getPath(); +// String p_string = p.toString(); +// String entity = p_string.substring(p_string.lastIndexOf("/") + 1); +// +// write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity); +// } +// +// } +// +// private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name) +// throws IOException { +// +// Path hdfsWritePath = new Path(outputPath); +// FSDataOutputStream fsDataOutputStream = null; +// if (fileSystem.exists(hdfsWritePath)) { +// fileSystem.delete(hdfsWritePath, true); +// +// } +// fsDataOutputStream = fileSystem.create(hdfsWritePath); +// +// TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream()); +// +// RemoteIterator fileStatusListIterator = fileSystem +// .listFiles( +// new Path(inputPath), true); +// +// while (fileStatusListIterator.hasNext()) { +// LocatedFileStatus fileStatus = fileStatusListIterator.next(); +// +// Path p = fileStatus.getPath(); +// String p_string = p.toString(); +// if (!p_string.endsWith("_SUCCESS")) { +// String name = p_string.substring(p_string.lastIndexOf("/") + 1); +// TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz"); +// entry.setSize(fileStatus.getLen()); +// ar.putArchiveEntry(entry); +// +// InputStream is = fileSystem.open(fileStatus.getPath()); +// +// BufferedInputStream bis = new BufferedInputStream(is); +// +// int count; +// byte data[] = new byte[1024]; +// while ((count = bis.read(data, 0, data.length)) != -1) { +// ar.write(data, 0, count); +// } +// bis.close(); +// ar.closeArchiveEntry(); +// +// } +// +// } +// +// ar.close(); +// } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java index 1dcbf6ccc..2e2b7bc26 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/export/zenodo/SendToZenodoHDFS.java @@ -73,7 +73,7 @@ public class SendToZenodoHDFS implements Serializable { } zenodoApiClient.sendMretadata(metadata); - zenodoApiClient.publish(); +// zenodoApiClient.publish(); } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml index 6d7056503..fd8c773c9 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/resources/eu/dnetlib/sx/zenodo/oozie_app/workflow.xml @@ -8,10 +8,10 @@ targetPath the target path - - metadata - the metadata - + + + + @@ -34,20 +34,20 @@ - - - eu.dnetlib.dhp.export.zenodo.SendToZenodoHDFS - --hdfsPath/user/dnet.scholexplorer/scholix/provision/scholix.tar/scholix-2020-10-16.tar - --nameNode${nameNode} - --accessTokenb6ddrY6b77WxcDEevn9gqVE5sL5sDNjdUijt75W3o7cQo5vpFFI48dMiu8Gv - --connectionUrlhttps://zenodo.org/api/deposit/depositions - --metadata${metadata} - --conceptRecordId1200252 - --newDepositionfalse - - - - + + + + + + + + + + + + + + \ No newline at end of file