From c58e0d99103dff3c0b5f0700a284a237a47d90e1 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 7 Oct 2022 16:53:59 +0200 Subject: [PATCH 1/3] [Enrichment - Subject Propagation] propagation of subjects from result to result --- .../null/publication/._SUCCESS.crc | Bin 0 -> 8 bytes ...cb-470f-b7ad-68b08e2882ec-c000.json.gz.crc | Bin 0 -> 48 bytes .../dhp-enrichment/null/publication/_SUCCESS | 0 ...2-40cb-470f-b7ad-68b08e2882ec-c000.json.gz | Bin 0 -> 4714 bytes .../eu/dnetlib/dhp/PropagationConstant.java | 3 + .../PrepareResultResultStep1.java | 156 ++++++++ .../ResultSubjectList.java | 34 ++ .../SparkSubjectPropagationStep2.java | 180 ++++++++++ .../SubjectInfo.java | 46 +++ .../dhp/subjecttoresultfromsemrel/Utils.java | 17 + ...put_preparesubjecttoresult_parameters.json | 46 +++ .../input_propagatesubject_parameters.json | 45 +++ .../oozie_app/config-default.xml | 63 ++++ .../subjectpropagation/oozie_app/workflow.xml | 307 ++++++++++++++++ .../SubjectPreparationJobTest.java | 159 +++++++++ .../SubjectPropagationJobTest.java | 333 ++++++++++++++++++ .../preparedInfo/dataset/dataset.json | 0 .../otherresearchproduct/orp.json | 0 .../preparedInfo/publication/preparedInfo | 2 + .../preparedInfo/software/software.json | 0 .../publication/publication.json | 4 + .../subjectpropagation/relation/relation.json | 10 + 22 files changed, 1405 insertions(+) create mode 100644 dhp-workflows/dhp-enrichment/null/publication/._SUCCESS.crc create mode 100644 dhp-workflows/dhp-enrichment/null/publication/.part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz.crc create mode 100644 dhp-workflows/dhp-enrichment/null/publication/_SUCCESS create mode 100644 dhp-workflows/dhp-enrichment/null/publication/part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz create mode 100644 dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/PrepareResultResultStep1.java create mode 100644 
dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/ResultSubjectList.java create mode 100644 dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SparkSubjectPropagationStep2.java create mode 100644 dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SubjectInfo.java create mode 100644 dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/Utils.java create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_preparesubjecttoresult_parameters.json create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_propagatesubject_parameters.json create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPreparationJobTest.java create mode 100644 dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPropagationJobTest.java create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/dataset/dataset.json create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/otherresearchproduct/orp.json create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/publication/preparedInfo create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/software/software.json create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/publication/publication.json create mode 100644 
dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/relation/relation.json diff --git a/dhp-workflows/dhp-enrichment/null/publication/._SUCCESS.crc b/dhp-workflows/dhp-enrichment/null/publication/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/dhp-workflows/dhp-enrichment/null/publication/.part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz.crc b/dhp-workflows/dhp-enrichment/null/publication/.part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..24a358fe2ef5e6c14d29676fc594795138aeccce GIT binary patch literal 48 zcmYc;N@ieSU}8whoSoM(DO0-d`{FFFT_+eH+6y_Crp+l;xVfQhbNf}z8FtSFy~G9i E0lCl;(*OVf literal 0 HcmV?d00001 diff --git a/dhp-workflows/dhp-enrichment/null/publication/_SUCCESS b/dhp-workflows/dhp-enrichment/null/publication/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/dhp-workflows/dhp-enrichment/null/publication/part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz b/dhp-workflows/dhp-enrichment/null/publication/part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..768f68d92df8f70eedbd68f24fe821073db26d3b GIT binary patch literal 4714 zcmV-w5|!;AiwFP!000000PS7fZrjL`zTc;42tJzxE6F7FFJLabc7BW#$42Ct0ZvX3 zw8*B^G0En1H|2P80_^2tZ}$cEVi!1hfqenro~PI+*{W_*qJAvf6MGV87zDORc6U{M zRbBnVPwkI2n(3*Kwc(qOwJ?jKgvP8kti5D2ngpy)PG}a@>b3lKewFe7D!i@NBu&n0 zL$_X2nOv?}`vi~2Olj)-Oex7HQ(YT=toadD3ai(SPoBPlw#<#Z?s3WzvS%v56Xj1? 
z%uLxJVR~p=dKZg;Egt7J*9&}trbQGnU$Y>TA~qJDu?r~dI)4)&iE_!_!N1|K+xOaI zZ_xJ#-7y_FUD{$@+H*RC77fF3*a2xX8f7TxY08z)S>mIJwsx==h$h15e|3Z>3zbT*VM6^et37c1z$pmGKd199uvFU0H_F`FH*UJAURWu^e1 zn$sNMWdY>A5HjEi#d4&CD&aKFk<9?jivrmhlXSxDRfX$1u7C*>A$EaOTT83>+ZKPG ztLl+}>?R48Oj*UNj((CrVsZqx0W z3QYNur?y;`7Ph+OXGYiu8uNq^J!NF)mml060s{)!(LXplM5WB2-RqsCdd$1b3hZNDi6%|NiYivovM00hctHWD#9F zBj;RC34;bM$W(|kqOy1)SR~*l@ce|&7-->(c#I>1kmza-1FTFX)3gB3n3`j;Nl!py z5jO@SQ^gn1X;dnb083*8-6Vj>`^28_)9Zvwl%FmPn zBcL~#E{PREOfG3hlOG7gv5^i!6o;ObXbX>?a<$cXBL`~vQ5G;%Wek1}ArNx0Xv18t zWFhn-=!yD99fg47O#?VtvZ?@Dj7ooMaTK5nHZ0tiH& zSZhD+3{lf51>~Vk2)^Ryg6Zw6;UR=NBPxj6ff zcQ(igHb+BfMzkDRfyqTh$OSgz!e{HtCfR3Nv!x2qGGq@%e#uN*>hd!LI@T;QTq-h2 zqDRc8HWZMjG^nA9JO`=)Lge!EghGF~{4Z>((1T1aKPS+v%};sayZj8@9Xbh(FaNF~ zdWtlSrkO1L1VY%zO%#iUbEHx@-coD!F$7n6U2(02yDH?M58Gb_elNqciL6;_hm%SeuE&$y%jv7V_2tP<}C6)NS>#C zG!auTWny^A98VLiy z<=s2{O=!84gr+o$I=W3 zH2^KowWI`485hKLuAr%Tl{J_!6!s&PMWD69=?(@Z_-ss}fUy9P#Ox10t>d#~p+Thc zcy5}NCRoKJOp9K#d9HtBZ_9i@z)L&V&p*RxvRL+;#M^QH=5+laXoX?B=QzXRpw;gT z#(~#!J=bqB*7LiJwa4Mu?F}4%OwEX)^0gYQ3clh-35^T@njHoJ84o$D_Ajm)5pm9% zBUIKj5PU_Q13uvzrf|n&jA>GWc7u96c&9JYE8MksS{jjEr`hg$gZ^$1YhN|JLEFi< zO_nyj?tIUy+G+U0od%fkn>|9P6$!uCJpd_Fp$u&I;q$Y=QyC)GCq4YQEoyf z8%NwK{SL-j7>|9n8K6xCKb5zRO|nIguPEcJDD&g1KfgNu`PBwxB*V4Hy9kz*(PlR- zmjLmZj6i&Cs&%S{yLsPX3#ghxPIgUSdW)SY!dZ~nb}TE&lF5pWPm@W+)pUc7MGKqt zTRi=Q!j#TZ>|L72xN{z-wV~VX^aq1>tKD}8PQ4~5FL#}fHCQH2cydtQVftwEIYJ8& z2|Z^FORFDWzXYMi?mxBm{^UJ_|IpZ~?Ql2W;mYNgqZdNdcANuvzHNt$mxd8kdAK5F zXmB4Ioa?n1)_0gcUI8#>#+xe&{y9HCZ<-zeYdk*7-E+Kmxa&2YU1<1S;P2Kx0d^J! 
z8XaPE{id<_8a3Z8g>Hl|IdouPuy<^PulKG+^e)A8a+ZkmWJA0l5c^!S?Y(Qg)F-0^ z%xuOcQ0%YM;;N03BMwQ*Gqvaxl+{}V*AK&I2yeg61UauH2O}*sdgc* z!Rr9JrvyaE5!KKR1g@{hKjRLsfZGts5!n}U3d=D`Mz$kYm|lbF)8(93;dKpOi!p&T z$ZrV9I$4aUb#icYx<4W(Qur_x;>kpjnTVQX?-+`;WIzrRyE5Zu+kRgZh~wtRGhC2H zvqc@H&FMY2^&kweUx47C4*j%pPh{M_Ko-_W~rbuvoxvF^zD{J1Qz!`Sqc9)Hg?ar!SiLqp^Sdc4->$27XW9 z?kF% z^31b2d1k*vo`H4OkB0nox*$Its~#-==zQ!|63V z=cc@E&uzAE%-i<7VasWH{eDHQmf}Uk zK5%_eC#ggNSqtWEEte5oFvb3B3H};L!&my5Y`3;B~biO6Au1?IfG?Bqc~%zwIyT}!okQh zjUrxzB`}Maf|3TxRPvakAj?>XX(!qRP@2GwsHKt#k|hC5VuD=zCK;s&Vt}mSoM#Mf zl`um%XC&o`;d)t?bO9TJkH$+o*2MgXmkhe>vV3x`Ks^0LKLf+6<9^Upcr> z`k){5damntJ3+_o_dA2o4f-&f?K01AcY;B`R?!C^;&A!R(Fdd0R}HuKxHW7&!MRmM zBK)S}<%(%Tl)ANc)nr@JusF~KAf6?+zw}c2^~&qkz5LCiWkc*0 zY=}{r4KXroh^?%^pQ#M$7wz{oo)2MQY#fluZ@JRv(r?~)xP=U|m2i^uQ; zhU17)WX5sJAA<=Qwo9ZCcnoCD@)?6W^G1s^EzLockCT6KpX?|NwH1@7e3Lx=CF7v3 zId0Xe2h!KYnud_I;OAA$AvN&Z?<1c(h9`$H(8WYU;woJ3sohOb{Sxlqo!~kC8CSe4Q1HyhE#6 zYL4SPxAZ6EW*Ys=4de(J)048t6!QPLUXh0;8!g5ekF-5DmL~yAk)nq*PI#J3K!M{7 znT7b`PZ?5Z0LsrSk!vgr*AY*2gSnN$H3d%IX+on5yeP3XbM4F{#m)5`O>1RpPJYAh zWEjdJXc_TeS^20{Um@oZ_lQ(|`zg5aJf#{5PxIicigNZ%MamTur;2j+?NH7(pIc}8(l>T)U1#97ystwy zn@8{U=hn?9{5L$g&LI7}FwSl;8egMkMLDY|XZJ-pt4^-FAzszVbrl8eYvZ*T6Mp;2 zb;X!^^~rSy^{t22?Uh#Pd-GL#MQ8i%E4WR^(S2Vpxc^xG{013J^X;ek9%OYU-NW~4 zc^|iO=Yx0W=H2Ydu@By{tFId>mp*ux(o75U*Q)HhA_8SD5)pG6RnC3z&Mlu0IlA(E zNPGc20vpZwar1+_^YxHDDRNYu68e2b!&f>jw2FbR9Rt-V!GB!eFZ%!f^Q-#*^0&X9 zGZ8Z_FH8o2N)<)?0a3)u8=NGYJmF8(kyc9fjBnUnCZV{=!G7X1nKI%SbM2v^zt$-*jDXosoQpw82)V!@8ZW*BSTw!=cmewtKz6_wX+j`kl~i zcUo?@OKEQ$j(Z(eyD4q(cgj4Qxr4VLqTOXis`}6u_MtB!r`@e1w-VOcE0~qvjoO!= z-#v8=a*=Azv~GrcmnFL;rPXfRb6Z_Db{gG5H)yo`UAHmr^m~ogxZQDSyGsXddmF(m zhSh%xhFQRfM@aAFoPFHv(%ng5wu_ctlPCsE~dj?*UXD%E804%m0RTDhu!^WG>;^L{b`Z1#=lnE+i}}&+s#ps*@Lxad~pa;UnZdJi-`0 sd4pZ2vkSr+q~+uvn-$;Yd*<6zyG0LXw`fD}H# resultClazz = (Class) Class.forName(resultClassName); + + final String resultType = parser.get("resultType"); + log.info("resultType: {}", resultType); + + final List subjectClassList = 
Arrays + .asList( + parser.get("subjectlist").split(";")) + .stream() + .map(s -> s.toLowerCase()) + .collect(Collectors.toList()); + log.info("subjectClassList: {}", subjectClassList); + + final List allowedSemRel = Arrays + .asList( + parser.get("allowedSemRel").split(";")) + .stream() + .map(s -> s.toLowerCase()) + .collect(Collectors.toList()); + log.info("allowedSemRel: {}", allowedSemRel); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + prepareInfo(spark, inputPath, outputPath, subjectClassList, allowedSemRel, resultClazz, resultType); + }); + } + + private static void prepareInfo(SparkSession spark, + String inputPath, + String outputPath, + List subjectClassList, + List allowedSemRel, + Class resultClazz, + String resultType) { + + Dataset result = readPath(spark, inputPath + "/" + resultType, resultClazz) + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + !r.getDataInfo().getInvisible() && + r + .getSubject() + .stream() + .anyMatch(s -> subjectClassList.contains(s.getQualifier().getClassid().toLowerCase()))); + + Dataset relation = readPath(spark, inputPath + "/relation", Relation.class) + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + allowedSemRel.contains(r.getRelClass().toLowerCase())); + + result + .joinWith(relation, result.col("id").equalTo(relation.col("source")), "right") + .groupByKey((MapFunction, String>) t2 -> t2._2().getTarget(), Encoders.STRING()) + .mapGroups( + (MapGroupsFunction, ResultSubjectList>) (k, + it) -> getResultSubjectList(subjectClassList, k, it), + Encoders.bean(ResultSubjectList.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "/" + resultType); + } + + @NotNull + private static ResultSubjectList getResultSubjectList(List subjectClassList, String k, + Iterator> it) { + ResultSubjectList rsl = new 
ResultSubjectList(); + rsl.setResId(k); + Tuple2 first = it.next(); + List sbjInfo = new ArrayList<>(); + Set subjectSet = new HashSet<>(); + extracted(subjectClassList, first._1().getSubject(), sbjInfo, subjectSet); + it.forEachRemaining(t2 -> extracted(subjectClassList, t2._1().getSubject(), sbjInfo, subjectSet)); + rsl.setSubjectList(sbjInfo); + return rsl; + } + + private static void extracted(List subjectClassList, List resultSubject, + List sbjList, Set subjectSet) { + + resultSubject + .stream() + .filter(s -> subjectClassList.contains(s.getQualifier().getClassid().toLowerCase())) + .forEach(s -> { + if (!subjectSet.contains(s.getValue())) + sbjList + .add( + SubjectInfo + .newInstance( + s.getQualifier().getClassid(), s.getQualifier().getClassname(), s.getValue())); + subjectSet.add(s.getValue()); + }); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/ResultSubjectList.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/ResultSubjectList.java new file mode 100644 index 000000000..b7c61e15e --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/ResultSubjectList.java @@ -0,0 +1,34 @@ + +package eu.dnetlib.dhp.subjecttoresultfromsemrel; + +import java.io.Serializable; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import eu.dnetlib.dhp.schema.oaf.Subject; + +/** + * @author miriam.baglioni + * @Date 04/10/22 + */ +public class ResultSubjectList implements Serializable { + private String resId; + List subjectList; + + public String getResId() { + return resId; + } + + public void setResId(String resId) { + this.resId = resId; + } + + public List getSubjectList() { + return subjectList; + } + + public void setSubjectList(List subjectList) { + this.subjectList = subjectList; + } +} diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SparkSubjectPropagationStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SparkSubjectPropagationStep2.java new file mode 100644 index 000000000..d546a8d8f --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SparkSubjectPropagationStep2.java @@ -0,0 +1,180 @@ + +package eu.dnetlib.dhp.subjecttoresultfromsemrel; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Subject; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import scala.Tuple2; + +/** + * @author miriam.baglioni + * @Date 05/10/22 + */ +public class SparkSubjectPropagationStep2 implements Serializable { + private static final Logger log = LoggerFactory.getLogger(SparkSubjectPropagationStep2.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkSubjectPropagationStep2.class + .getResourceAsStream( + "/eu/dnetlib/dhp/subjectpropagation/input_propagatesubject_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = 
isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String preparedPath = parser.get("preparedPath"); + log.info("preparedPath: {}", preparedPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); + + Class resultClazz = (Class) Class.forName(resultClassName); + + final String resultType = parser.get("resultType"); + log.info("resultType: {}", resultType); + + final String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + execPropagation(spark, inputPath, outputPath, workingPath, preparedPath, resultClazz, resultType); + }); + } + + private static void execPropagation(SparkSession spark, + String inputPath, + String outputPath, + String workingPath, + String preparedPath, + Class resultClazz, + String resultType) { + + Dataset results = readPath(spark, inputPath + "/" + resultType, resultClazz); + Dataset preparedResult = readPath( + spark, preparedPath + "/publication", ResultSubjectList.class) + .union(readPath(spark, preparedPath + "/dataset", ResultSubjectList.class)) + .union(readPath(spark, preparedPath + "/software", ResultSubjectList.class)) + .union(readPath(spark, preparedPath + "/otherresearchproduct", ResultSubjectList.class)); + + results + .joinWith( + preparedResult, + results.col("id").equalTo(preparedResult.col("resId")), + "left") + .map((MapFunction, R>) t2 -> { + R res = t2._1(); + if (Optional.ofNullable(t2._2()).isPresent()) { + // estraggo le tipologie di subject dal result + Map> resultMap = new HashMap<>(); + res.getSubject().stream().forEach(s -> { + String 
cid = s.getQualifier().getClassid(); + if (!resultMap.containsKey(cid)) { + resultMap.put(cid, new ArrayList<>()); + } + resultMap.get(cid).add(s.getValue()); + }); + + // Remove from the list all the subjects with the same class already present in the result + List distinctClassId = t2 + ._2() + .getSubjectList() + .stream() + .map(si -> si.getClassid()) + .distinct() + .collect(Collectors.toList()); + List sbjInfo = new ArrayList<>(); + for (String k : distinctClassId) { + if (!resultMap.containsKey(k)) + sbjInfo = t2 + ._2() + .getSubjectList() + .stream() + .filter(s -> s.getClassid().equalsIgnoreCase(k)) + .collect(Collectors.toList()); + else + sbjInfo = t2 + ._2() + .getSubjectList() + .stream() + .filter( + s -> s.getClassid().equalsIgnoreCase(k) && + !resultMap.get(k).contains(s.getValue())) + .collect(Collectors.toList()); + // All the subjects not already present in the result are added + for (SubjectInfo si : sbjInfo) { + res.getSubject().add(getSubject(si)); + } + + } + + } + return res; + }, Encoders.bean(resultClazz)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(workingPath + "/" + resultType); + + readPath(spark, workingPath + "/" + resultType, resultClazz) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "/" + resultType); + + } + + private static Subject getSubject(SubjectInfo si) { + return OafMapperUtils + .subject( + si.getValue(), + si.getClassid(), si.getClassname(), + ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, + OafMapperUtils + .dataInfo( + false, PROPAGATION_DATA_INFO_TYPE, + true, false, + OafMapperUtils + .qualifier( + PROPAGATION_SUBJECT_RESULT_SEMREL_CLASS_ID, + PROPAGATION_SUBJECT_RESULT_SEMREL_CLASS_NAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "0.85")); + + } + +} diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SubjectInfo.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SubjectInfo.java new file mode 100644 index 000000000..ace40a6d4 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SubjectInfo.java @@ -0,0 +1,46 @@ + +package eu.dnetlib.dhp.subjecttoresultfromsemrel; + +import java.io.Serializable; + +/** + * @author miriam.baglioni + * @Date 06/10/22 + */ +public class SubjectInfo implements Serializable { + private String classid; + private String value; + private String classname; + + public static SubjectInfo newInstance(String classid, String classname, String value) { + SubjectInfo si = new SubjectInfo(); + si.classid = classid; + si.value = value; + si.classname = classname; + return si; + } + + public String getClassid() { + return classid; + } + + public void setClassid(String classid) { + this.classid = classid; + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } + + public String getClassname() { + return classname; + } + + public void setClassname(String classname) { + this.classname = classname; + } +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/Utils.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/Utils.java new file mode 100644 index 000000000..6879fb547 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/Utils.java @@ -0,0 +1,17 @@ + +package eu.dnetlib.dhp.subjecttoresultfromsemrel; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Stream; + +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.Subject; + +/** + * @author miriam.baglioni + * @Date 05/10/22 + */ +public class Utils implements Serializable { + +} 
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_preparesubjecttoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_preparesubjecttoresult_parameters.json new file mode 100644 index 000000000..a8ec1d5b3 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_preparesubjecttoresult_parameters.json @@ -0,0 +1,46 @@ +[ + + + { + "paramName":"asr", + "paramLongName":"allowedSemRel", + "paramDescription": "the set of semantic relations between the results to be exploited to perform the propagation", + "paramRequired": true + }, + { + "paramName":"sl", + "paramLongName":"subjectlist", + "paramDescription": "the list of classid for the subject we wanti to propagate", + "paramRequired": true + }, + { + "paramName":"rt", + "paramLongName":"resultType", + "paramDescription": "the result type", + "paramRequired": true + }, + { + "paramName":"sp", + "paramLongName":"sourcePath", + "paramDescription": "the path of the input graph", + "paramRequired": true + }, + { + "paramName":"rtn", + "paramLongName":"resultTableName", + "paramDescription": "the class of the result", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "the path where prepared info have been stored", + "paramRequired": false + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_propagatesubject_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_propagatesubject_parameters.json new file mode 100644 index 000000000..0cb51c598 --- /dev/null +++ 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_propagatesubject_parameters.json @@ -0,0 +1,45 @@ +[ + + { + "paramName":"pp", + "paramLongName":"preparedPath", + "paramDescription": "the path to the prepared information", + "paramRequired": true + }, + { + "paramName":"rt", + "paramLongName":"resultType", + "paramDescription": "the result type", + "paramRequired": true + }, + { + "paramName":"ip", + "paramLongName":"inputPath", + "paramDescription": "the path of the input graph", + "paramRequired": true + }, + { + "paramName":"rtn", + "paramLongName":"resultTableName", + "paramDescription": "the class of the result", + "paramRequired": true + }, + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "the path where prepared info have been stored", + "paramRequired": false + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store output files", + "paramRequired": true + }, + { + "paramName": "wp", + "paramLongName": "workingPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true +} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/config-default.xml new file mode 100644 index 000000000..caf3c6050 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/config-default.xml @@ -0,0 +1,63 @@ + + + jobTracker + yarnRM + + + + nameNode + + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2EventLogDir + 
/user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + sparkExecutorNumber + 4 + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + + spark2MaxExecutors + 50 + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml new file mode 100644 index 000000000..b7f48a4e0 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml @@ -0,0 +1,307 @@ + + + + + + + + + + + + sourcePath + the source path + + + subjectlist + the list of subject classid to propagate (split by ;) + + + resultType + the result tapy + + + resultTableName + the class of the result + + + allowedsemrels + the allowed semantics + + + outputPath + the output path + + + + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + + + + + + yarn + cluster + PrepareProjectResultsAssociation + eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath} + --allowedsemrels${allowedsemrels} + --subjectlist${subjectlist} + 
--resultTypepublication + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --outputPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + PrepareProjectResultsAssociation + eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath} + --allowedsemrels${allowedsemrels} + --subjectlist${subjectlist} + --resultTypedataset + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --outputPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + PrepareProjectResultsAssociation + eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath} + --allowedsemrels${allowedsemrels} + --subjectlist${subjectlist} + --resultTypesoftware + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --outputPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + PrepareProjectResultsAssociation + eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1 + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf 
spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath} + --allowedsemrels${allowedsemrels} + --subjectlist${subjectlist} + --resultTypeotherresearchproduct + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --outputPath${workingDir}/preparedInfo + + + + + + + + + + + + + + + + + yarn + cluster + ProjectToResultPropagation + eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --inputPath${sourcePath} + --outputPath${outputPath} + --workingPath${workingDir}/working + --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication + --resultTypepublication + --preparedPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + ProjectToResultPropagation + eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf 
spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --inputPath${sourcePath} + --outputPath${outputPath} + --workingPath${workingDir}/working + --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct + --resultTypeotherresearchproduct + --preparedPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + ProjectToResultPropagation + eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --inputPath${sourcePath} + --outputPath${outputPath} + --workingPath${workingDir}/working + --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset + --resultTypedataset + --preparedPath${workingDir}/preparedInfo + + + + + + + + yarn + cluster + ProjectToResultPropagation + eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --inputPath${sourcePath} + --outputPath${outputPath} + 
--workingPath${workingDir}/working + --resultTableNameeu.dnetlib.dhp.schema.oaf.Software + --resultTypesoftware + --preparedPath${workingDir}/preparedInfo + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPreparationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPreparationJobTest.java new file mode 100644 index 000000000..f782feab9 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPreparationJobTest.java @@ -0,0 +1,159 @@ + +package eu.dnetlib.dhp.subjectpropagation; + +import static org.apache.spark.sql.functions.desc; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest; +import eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Subject; +import eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1; +import eu.dnetlib.dhp.subjecttoresultfromsemrel.ResultSubjectList; +import eu.dnetlib.dhp.subjecttoresultfromsemrel.SubjectInfo; + +/** + * @author miriam.baglioni + * 
@Date 05/10/22 + */ +public class SubjectPreparationJobTest { + private static final Logger log = LoggerFactory.getLogger(SubjectPreparationJobTest.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(SubjectPreparationJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(SubjectPreparationJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(SubjectPreparationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + void testSparkSubjectToResultThroughSemRelJob() throws Exception { + PrepareResultResultStep1 + .main( + new String[] { + "-allowedSemRel", + "IsSupplementedBy;IsSupplementTo;IsPreviousVersionOf;IsNewVersionOf;IsIdenticalTo;Obsoletes;IsObsoletedBy;IsVersionOf", + "-subjectlist", "fos;sdg", + "-resultType", "publication", + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", getClass() + .getResource("/eu/dnetlib/dhp/subjectpropagation") + .getPath(), + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-outputPath", workingDir.toString() + + }); + + // 50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba fake_fos and fake_sdg IsVersionOf + // 50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98 + // 50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f 
fake_fos2 Obsoletes + // 50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98 + // 50|355e65625b88::046477dc24819c5f1453166aa7bfb75e fake_fos2 IsSupplementedBy + // 50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98 + // 50|355e65625b88::046477dc24819c5f1453166aa7bfb75e fake_fos2 IsSupplementTo + // 50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/publication") + .map(item -> OBJECT_MAPPER.readValue(item, ResultSubjectList.class)); + + Assertions.assertEquals(2, tmp.count()); + + Assertions + .assertEquals( + 1, tmp.filter(r -> r.getResId().equals("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")).count()); + Assertions + .assertEquals( + 1, tmp.filter(r -> r.getResId().equals("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")).count()); + + List sbjList = tmp + .filter(r -> r.getResId().equals("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")) + .first() + .getSubjectList(); + + Assertions.assertEquals(3, sbjList.size()); + Assertions.assertEquals(1, sbjList.stream().filter(s -> s.getClassid().equals("sdg")).count()); + Assertions.assertEquals(2, sbjList.stream().filter(s -> s.getClassid().equals("fos")).count()); + + Assertions + .assertEquals( + "fake_sdg", + sbjList.stream().filter(s -> s.getClassid().equalsIgnoreCase("sdg")).findFirst().get().getValue()); + Assertions + .assertTrue( + sbjList + .stream() + .filter(s -> s.getClassid().equalsIgnoreCase("fos")) + .anyMatch(s -> s.getValue().equals("fake_fos"))); + Assertions + .assertTrue( + sbjList + .stream() + .filter(s -> s.getClassid().equalsIgnoreCase("fos")) + .anyMatch(s -> s.getValue().equals("fake_fos2"))); + + sbjList = tmp + .filter(r -> r.getResId().equals("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")) + .first() + .getSubjectList(); + + Assertions.assertEquals(1, sbjList.size()); + Assertions.assertEquals("fos",
sbjList.get(0).getClassid().toLowerCase()); + + Assertions.assertEquals("fake_fos2", sbjList.get(0).getValue()); + + tmp.foreach(s -> System.out.println(OBJECT_MAPPER.writeValueAsString(s))); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPropagationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPropagationJobTest.java new file mode 100644 index 000000000..b324b49d8 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPropagationJobTest.java @@ -0,0 +1,333 @@ + +package eu.dnetlib.dhp.subjectpropagation; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import org.apache.commons.io.FileUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.PropagationConstant; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Subject; +import eu.dnetlib.dhp.subjecttoresultfromsemrel.SparkSubjectPropagationStep2; + +/** + * @author miriam.baglioni + * @Date 06/10/22 + */ +public class SubjectPropagationJobTest { + private static final Logger log = LoggerFactory.getLogger(SubjectPropagationJobTest.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = 
Files.createTempDirectory(SubjectPropagationJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(SubjectPropagationJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(SubjectPropagationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + void testSparkSubjectToResultThroughSemRelJob() throws Exception { + SparkSubjectPropagationStep2 + .main( + new String[] { + "-preparedPath", getClass() + .getResource("/eu/dnetlib/dhp/subjectpropagation/preparedInfo") + .getPath(), + "-resultType", "publication", + "-inputPath", getClass() + .getResource("/eu/dnetlib/dhp/subjectpropagation") + .getPath(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-workingPath", workingDir.toString() + "/working", + "-outputPath", workingDir.toString() + }); + + // 50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98 should receive fake_fos, fake_sdg and fake_fos2 + // 50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba should receive fake_fos2 + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/publication") + .map(item -> OBJECT_MAPPER.readValue(item, Publication.class)); + + Assertions.assertEquals(4, tmp.count()); + + Assertions + .assertEquals( + 2, tmp + .filter( + r -> r + .getSubject() + .stream() + .anyMatch( + s -> s + .getDataInfo() + 
.getInferenceprovenance() + .equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE))) + .count()); + + JavaRDD sbjs = tmp + .flatMap((FlatMapFunction) r -> r.getSubject().iterator()) + .filter( + s -> s.getDataInfo().getInferenceprovenance().equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE)); + + Assertions.assertEquals(4, sbjs.count()); + Assertions + .assertEquals( + 4, sbjs + .filter( + s -> s + .getDataInfo() + .getProvenanceaction() + .getClassid() + .equals(PropagationConstant.PROPAGATION_SUBJECT_RESULT_SEMREL_CLASS_ID)) + .count()); + Assertions + .assertEquals( + 4, + sbjs + .filter( + s -> s + .getDataInfo() + .getProvenanceaction() + .getClassname() + .equals(PropagationConstant.PROPAGATION_SUBJECT_RESULT_SEMREL_CLASS_NAME)) + .count()); + Assertions.assertEquals(3, sbjs.filter(s -> s.getQualifier().getClassid().equals("FOS")).count()); + Assertions + .assertEquals(3, sbjs.filter(s -> s.getQualifier().getClassname().equals("Field of Science")).count()); + Assertions.assertEquals(1, sbjs.filter(s -> s.getQualifier().getClassid().equals("SDG")).count()); + Assertions + .assertEquals( + 1, sbjs.filter(s -> s.getQualifier().getClassname().equals("Support and Development Goals")).count()); + + Assertions + .assertEquals( + 6, + tmp + .filter(r -> r.getId().equals("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")) + .first() + .getSubject() + .size()); + Assertions + .assertEquals( + 3, tmp + .filter( + r -> r + .getId() + .equals("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")) + .first() + .getSubject() + .stream() + .filter( + s -> s + .getDataInfo() + .getInferenceprovenance() + .equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE)) + .count()); + Assertions + .assertEquals( + 3, tmp + .filter( + r -> r + .getId() + .equals("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")) + .first() + .getSubject() + .stream() + .filter( + s -> !s + .getDataInfo() + .getInferenceprovenance() + .equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE)) + 
.count()); + Assertions + .assertEquals( + 2, tmp + .filter( + r -> r + .getId() + .equals("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")) + .first() + .getSubject() + .stream() + .filter( + s -> s + .getDataInfo() + .getInferenceprovenance() + .equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE) && + s.getQualifier().getClassid().equals("FOS")) + .count()); + Assertions + .assertEquals( + 1, tmp + .filter( + r -> r + .getId() + .equals("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")) + .first() + .getSubject() + .stream() + .filter( + s -> s + .getDataInfo() + .getInferenceprovenance() + .equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE) && + s.getQualifier().getClassid().equals("SDG")) + .count()); + + Assertions + .assertTrue( + tmp + .filter(r -> r.getId().equals("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")) + .first() + .getSubject() + .stream() + .anyMatch(s -> s.getValue().equals("fake_fos"))); + Assertions + .assertTrue( + tmp + .filter(r -> r.getId().equals("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")) + .first() + .getSubject() + .stream() + .anyMatch(s -> s.getValue().equals("fake_fos2"))); + Assertions + .assertTrue( + tmp + .filter(r -> r.getId().equals("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98")) + .first() + .getSubject() + .stream() + .anyMatch(s -> s.getValue().equals("fake_sdg"))); + + Assertions + .assertEquals( + 6, + tmp + .filter(r -> r.getId().equals("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")) + .first() + .getSubject() + .size()); + Assertions + .assertEquals( + 1, tmp + .filter( + r -> r + .getId() + .equals("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")) + .first() + .getSubject() + .stream() + .filter( + s -> s + .getDataInfo() + .getInferenceprovenance() + .equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE)) + .count()); + Assertions + .assertEquals( + 5, tmp + .filter( + r -> r + .getId() + .equals("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")) + .first() + 
.getSubject() + .stream() + .filter( + s -> !s + .getDataInfo() + .getInferenceprovenance() + .equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE)) + .count()); + Assertions + .assertEquals( + 1, tmp + .filter( + r -> r + .getId() + .equals("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")) + .first() + .getSubject() + .stream() + .filter( + s -> s + .getDataInfo() + .getInferenceprovenance() + .equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE) && + s.getQualifier().getClassid().equals("FOS")) + .count()); + Assertions + .assertEquals( + 0, tmp + .filter( + r -> r + .getId() + .equals("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")) + .first() + .getSubject() + .stream() + .filter( + s -> s + .getDataInfo() + .getInferenceprovenance() + .equals(PropagationConstant.PROPAGATION_DATA_INFO_TYPE) && + s.getQualifier().getClassid().equals("SDG")) + .count()); + + Assertions + .assertTrue( + tmp + .filter(r -> r.getId().equals("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")) + .first() + .getSubject() + .stream() + .anyMatch(s -> s.getValue().equals("fake_fos2"))); + + } +} diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/dataset/dataset.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/dataset/dataset.json new file mode 100644 index 000000000..e69de29bb diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/otherresearchproduct/orp.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/otherresearchproduct/orp.json new file mode 100644 index 000000000..e69de29bb diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/publication/preparedInfo b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/publication/preparedInfo new file mode 100644 index 
000000000..abbea5506 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/publication/preparedInfo @@ -0,0 +1,2 @@ +{"resId":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba","subjectList":[{"classid":"FOS", "classname":"Field of Science","value":"fake_fos2"}]} +{"resId":"50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98","subjectList":[{"classid":"FOS", "classname":"Field of Science","value":"fake_fos"},{"classid":"SDG","classname":"Support and Development Goals","value":"fake_sdg"},{"classid":"FOS", "classname":"Field of Science","value":"fake_fos2"}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/software/software.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/preparedInfo/software/software.json new file mode 100644 index 000000000..e69de29bb diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/publication/publication.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/publication/publication.json new file mode 100644 index 000000000..da5a26580 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/publication/publication.json @@ -0,0 +1,4 @@ +{"author":[{"fullname":"Levande, Paul","name":"Paul","pid":[],"rank":1,"surname":"Levande"}],"bestaccessright":{"classid":"OPEN","classname":"Open 
Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Coordination Episciences iam"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2011-01-01"},"dateofcollection":"2022-04-12T19:57:46.9Z","dateoftransformation":"2022-04-12T20:18:26.16Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"We examine the $q=1$ and $t=0$ special cases of the parking functions conjecture. The parking functions conjecture states that the Hilbert series for the space of diagonal harmonics is equal to the bivariate generating function of $area$ and $dinv$ over the set of parking functions. 
Haglund recently proved that the Hilbert series for the space of diagonal harmonics is equal to a bivariate generating function over the set of Tesler matrices–upper-triangular matrices with every hook sum equal to one. We give a combinatorial interpretation of the Haglund generating function at $q=1$ and prove the corresponding case of the parking functions conjecture (first proven by Garsia and Haiman). We also discuss a possible proof of the $t = 0$ case consistent with this combinatorial interpretation. We conclude by briefly discussing possible refinements of the parking functions conjecture arising from this research and point of view. $\\textbf{Note added in proof}$: We have since found such a proof of the $t = 0$ case and conjectured more detailed refinements. This research will most likely be presented in full in a forthcoming article."},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"On examine les cas spéciaux $q=1$ et $t=0$ de la conjecture des fonctions de stationnement. Cette conjecture déclare que la série de Hilbert pour l'espace des harmoniques diagonaux est égale à la fonction génératrice bivariée (paramètres $area$ et $dinv$) sur l'ensemble des fonctions de stationnement. Haglund a prouvé récemment que la série de Hilbert pour l'espace des harmoniques diagonaux est égale à une fonction génératrice bivariée sur l'ensemble des matrices de Tesler triangulaires supérieures dont la somme de chaque équerre vaut un. On donne une interprétation combinatoire de la fonction génératrice de Haglund pour $q=1$ et on prouve le cas correspondant de la conjecture dans le cas des fonctions de stationnement (prouvé d'abord par Garsia et Haiman). 
On discute aussi d'une preuve possible du cas $t=0$, cohérente avec cette interprétation combinatoire. On conclut en discutant brièvement les raffinements possibles de la conjecture des fonctions de stationnement de ce point de vue. $\\textbf{Note ajoutée sur épreuve}$: j'ai trouvé depuis cet article une preuve du cas $t=0$ et conjecturé des raffinements possibles. Ces résultats seront probablement présentés dans un article ultérieur."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.46298/dmtcs.2940"}],"collectedfrom":{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2011-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://dmtcs.episciences.org/2940"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1658994348190,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Foai.episciences.org%2F","datestamp":"2011-01-01","harvestDate":"2022-04-12T19:57:46.9Z","identifier":"oai:episciences.org:dmtcs:2940","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba","oai:episciences.org:dmtcs:2940"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActi
ons","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"ISSN: 1365-8050"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Discrete Mathematics & Theoretical Computer Science"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Episciences.org"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"dmtcs:2940 - Discrete Mathematics & Theoretical Computer Science, 2011-01-01, DMTCS Proceedings vol. 
AO, 23rd International Conference on Formal Power Series and Algebraic Combinatorics (FPSAC 2011)"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"fos","classname":"Field of Science","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"fake_fos"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Hilbert series"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"sdg","classname":"Support Development Goal","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"fake_sdg"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"[MATH.MATH-CO] Mathematics 
[math]/Combinatorics [math.CO]"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"acm","classname":"acm","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"[INFO.INFO-DM] Computer Science [cs]/Discrete Mathematics [cs.DM]"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Special Cases of the Parking Functions Conjecture and Upper-Triangular Matrices"}]} +{"author":[{"fullname":"Blondin, Michael","name":"Michael","pid":[],"rank":1,"surname":"Blondin"},{"fullname":"Raskin, Mikhail","name":"Mikhail","pid":[],"rank":2,"surname":"Raskin"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Michael 
Blondin"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2021-07-20"},"dateofcollection":"2022-04-12T19:57:21.4Z","dateoftransformation":"2022-04-12T20:22:30.288Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Vector addition systems with states (VASS) are widely used for the formalverification of concurrent systems. Given their tremendous computationalcomplexity, practical approaches have relied on techniques such as reachabilityrelaxations, e.g., allowing for negative intermediate counter values. It isnatural to question their feasibility for VASS enriched with primitives thattypically translate into undecidability. Spurred by this concern, we pinpointthe complexity of integer relaxations with respect to arbitrary classes ofaffine operations. More specifically, we provide a trichotomy on the complexity of integerreachability in VASS extended with affine operations (affine VASS). Namely, weshow that it is NP-complete for VASS with resets, PSPACE-complete for VASS with(pseudo-)transfers and VASS with (pseudo-)copies, and undecidable for any otherclass. 
We further present a dichotomy for standard reachability in affine VASS:it is decidable for VASS with permutations, and undecidable for any otherclass. This yields a complete and unified complexity landscape of reachabilityin affine VASS. We also consider the reachability problem parameterized by afixed affine VASS, rather than a class, and we show that the complexitylandscape is arbitrary in this setting."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.46298/lmcs-17(3:3)2021"}],"collectedfrom":{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2021-07-20"},"distributionlocation":"","hostedby":{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://lmcs.episciences.org/7687"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1658994354107,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Foai.episciences.org%2F","datestamp":"2021-07-20","harvestDate":"2022-04-12T19:57:21.4Z","identifier":"oai:episciences.org:lmcs:7687","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98","oai:episciences.org:lmcs:7687"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:proven
anceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"ISSN: 1860-5974"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Logical Methods in Computer Science"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Episciences.org"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"lmcs:6872 - Logical Methods in Computer Science, 2021-07-20, Volume 17, Issue 3"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Computer Science - Logic in Computer 
Science"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Computer Science - Computational Complexity"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Computer Science - Formal Languages and Automata Theory"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"The Complexity of Reachability in Affine Vector Addition Systems with States"}]} +{"author":[{"fullname":"Ward, Mark Daniel","name":"Mark Daniel","pid":[],"rank":1,"surname":"Ward"},{"fullname":"Szpankowski, Wojciech","name":"Wojciech","pid":[],"rank":2,"surname":"Szpankowski"}],"bestaccessright":{"classid":"OPEN","classname":"Open 
Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Coordination Episciences iam"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2005-01-01"},"dateofcollection":"2022-04-12T19:57:43.247Z","dateoftransformation":"2022-04-12T20:25:54.387Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"In a suffix tree, the multiplicity matching parameter (MMP) $M_n$ is the number of leaves in the subtree rooted at the branching point of the $(n+1)$st insertion. Equivalently, the MMP is the number of pointers into the database in the Lempel-Ziv '77 data compression algorithm. We prove that the MMP asymptotically follows the logarithmic series distribution plus some fluctuations. 
In the proof we compare the distribution of the MMP in suffix trees to its distribution in tries built over independent strings. Our results are derived by both probabilistic and analytic techniques of the analysis of algorithms. In particular, we utilize combinatorics on words, bivariate generating functions, pattern matching, recurrence relations, analytical poissonization and depoissonization, the Mellin transform, and complex analysis."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.46298/dmtcs.3387"}],"collectedfrom":{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2005-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://dmtcs.episciences.org/3387"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1658994359132,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Foai.episciences.org%2F","datestamp":"2005-01-01","harvestDate":"2022-04-12T19:57:43.247Z","identifier":"oai:episciences.org:dmtcs:3387","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f","oai:episciences.org:dmtcs:3387"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceAc
tions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"ISSN: 1365-8050"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Discrete Mathematics & Theoretical Computer Science"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Episciences.org"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"dmtcs:3387 - Discrete Mathematics & Theoretical Computer Science, 2005-01-01, DMTCS Proceedings vol. 
AD, International Conference on Analysis of Algorithms"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"fos","classname":"Fild of Science","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"fake_fos2"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"complex asymptotics"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"suffix trees"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"combinatorics on 
words"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"pattern matching"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"autocorrelation polynomial"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"[INFO.INFO-DS] Computer Science [cs]/Data Structures and Algorithms [cs.DS]"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"[INFO.INFO-DM] Computer Science [cs]/Discrete Mathematics 
[cs.DM]"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"[MATH.MATH-CO] Mathematics [math]/Combinatorics [math.CO]"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"[INFO.INFO-CG] Computer Science [cs]/Computational Geometry [cs.CG]"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Analysis of the multiplicity matching parameter in suffix trees"}]} +{"author":[{"fullname":"Södergård, Caj","name":"Caj","pid":[],"rank":1,"surname":"Södergård"}],"bestaccessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information 
System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1989-01-01"},"dateofcollection":"2022-07-09T12:22:11.472Z","dateoftransformation":"2022-07-09T12:45:18.112Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::046477dc24819c5f1453166aa7bfb75e","instance":[{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1989-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0004","classname":"Conference 
object","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/42136eb0-696d-4861-b587-3b451a46a914"]}],"language":{"classid":"fin","classname":"Finnish","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1658994245711,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-06-27T08:32:36Z","harvestDate":"2022-07-09T12:22:11.472Z","identifier":"oai:cris.vtt.fi:publications/42136eb0-696d-4861-b587-3b451a46a914","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::046477dc24819c5f1453166aa7bfb75e","oai:cris.vtt.fi:publications/42136eb0-696d-4861-b587-3b451a46a914"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Södergård , C 1989 , ' Telefax ja telefoto ' , Kehittyvä tiedonsiirto graafisessa yrityksessä , Helsinki , Finland , 29/05/89 - 30/05/89 ."}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"fos","classname":"Fild of 
Science","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"fake_fos2"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Telefax ja telefoto"}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/relation/relation.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/relation/relation.json new file mode 100644 index 000000000..03f9d1298 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/subjectpropagation/relation/relation.json @@ -0,0 +1,10 @@ +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1658906673376,"properties":[],"relClass":"IsVersionOf","relType":"datasourceOrganization","source":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba","subRelType":"provision","target":"50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98","validated":false} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1658906673376,"properties":[],"relClass":"Obsolets","relType":"datasourceOrganization","source":"50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f","subRelType":"provision","target":"50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1658906673376,"properties":[],"relClass":"IsSupplementedBy","relType":"datasourceOrganization","source":"50|355e65625b88::046477dc24819c5f1453166aa7bfb75e","subRelType":"provision","target":"50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1658906673376,"properties":[],"relClass":"IsSupplementTo","relType":"datasourceOrganization","source":"50|355e65625b88::046477dc24819c5f1453166aa7bfb75e","subRelType":"provision","target":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba","validated":false} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1658906673376,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::8b75543067b50076e70764917e188178","subRelType":"provision","target":"20|doajarticles::50cb15ff7a6a3f8531f063770179e346","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1658906673376,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::9f3ff882f023209d9ffb4dc32b77d376","subRelType":"provision","target":"20|doajarticles::ffc1811633b3222e4764c7b0517f83e8","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1658906673376,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::ccc002b33c28ea8a1eab16db2eebd7a5","subRelType":"provision","target":"20|pending_org_::ab7b11bb317a6249f9b6becc7dd98043","validated":false} 
+{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1658906673376,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::d70ec79cbd10ebee6e6f54689d2cc7a9","subRelType":"provision","target":"20|openorgs____::9d6dceaf5e56ef060226e4ef7faa28a0","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1658906673376,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::e2d4766f243c621e9a9300e6aa74d5a0","subRelType":"provision","target":"20|pending_org_::cfea8083b4f958b42314b50497aebd59","validated":false} +{"collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles"}],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.900"},"lastupdatetimestamp":1658906673376,"properties":[],"relClass":"isProvidedBy","relType":"datasourceOrganization","source":"10|doajarticles::fc496a0a4bb49d654649598b04008682","subRelType":"provision","target":"20|pending_org_::a41c5f55ae53f44abb8bc0e89398074e","validated":false} \ No newline at end of file -- 2.17.1 From 
3d496684313b7de52035a6b84755e001493e6be8 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Fri, 4 Nov 2022 17:36:09 +0100 Subject: [PATCH 2/3] [Subject Propagation] test and workflow definition --- .../null/publication/._SUCCESS.crc | Bin 8 -> 0 bytes ...cb-470f-b7ad-68b08e2882ec-c000.json.gz.crc | Bin 48 -> 0 bytes .../dhp-enrichment/null/publication/_SUCCESS | 0 ...2-40cb-470f-b7ad-68b08e2882ec-c000.json.gz | Bin 4714 -> 0 bytes .../eu/dnetlib/dhp/PropagationConstant.java | 33 ++++++++-- .../PrepareResultResultStep1.java | 16 +++-- .../SparkSubjectPropagationStep2.java | 48 +++++++++------ ...put_preparesubjecttoresult_parameters.json | 2 +- .../input_propagatesubject_parameters.json | 4 +- .../oozie_app/config-default.xml | 2 +- .../subjectpropagation/oozie_app/workflow.xml | 58 +++++++----------- .../SubjectPreparationJobTest.java | 2 +- .../SubjectPropagationJobTest.java | 2 +- 13 files changed, 99 insertions(+), 68 deletions(-) delete mode 100644 dhp-workflows/dhp-enrichment/null/publication/._SUCCESS.crc delete mode 100644 dhp-workflows/dhp-enrichment/null/publication/.part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz.crc delete mode 100644 dhp-workflows/dhp-enrichment/null/publication/_SUCCESS delete mode 100644 dhp-workflows/dhp-enrichment/null/publication/part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz diff --git a/dhp-workflows/dhp-enrichment/null/publication/._SUCCESS.crc b/dhp-workflows/dhp-enrichment/null/publication/._SUCCESS.crc deleted file mode 100644 index 3b7b044936a890cd8d651d349a752d819d71d22c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8 PcmYc;N@ieSU}69O2$TUk diff --git a/dhp-workflows/dhp-enrichment/null/publication/.part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz.crc b/dhp-workflows/dhp-enrichment/null/publication/.part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz.crc deleted file mode 100644 index 
24a358fe2ef5e6c14d29676fc594795138aeccce..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 48 zcmYc;N@ieSU}8whoSoM(DO0-d`{FFFT_+eH+6y_Crp+l;xVfQhbNf}z8FtSFy~G9i E0lCl;(*OVf diff --git a/dhp-workflows/dhp-enrichment/null/publication/_SUCCESS b/dhp-workflows/dhp-enrichment/null/publication/_SUCCESS deleted file mode 100644 index e69de29bb..000000000 diff --git a/dhp-workflows/dhp-enrichment/null/publication/part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz b/dhp-workflows/dhp-enrichment/null/publication/part-00000-d0707c22-40cb-470f-b7ad-68b08e2882ec-c000.json.gz deleted file mode 100644 index 768f68d92df8f70eedbd68f24fe821073db26d3b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4714 zcmV-w5|!;AiwFP!000000PS7fZrjL`zTc;42tJzxE6F7FFJLabc7BW#$42Ct0ZvX3 zw8*B^G0En1H|2P80_^2tZ}$cEVi!1hfqenro~PI+*{W_*qJAvf6MGV87zDORc6U{M zRbBnVPwkI2n(3*Kwc(qOwJ?jKgvP8kti5D2ngpy)PG}a@>b3lKewFe7D!i@NBu&n0 zL$_X2nOv?}`vi~2Olj)-Oex7HQ(YT=toadD3ai(SPoBPlw#<#Z?s3WzvS%v56Xj1? 
z%uLxJVR~p=dKZg;Egt7J*9&}trbQGnU$Y>TA~qJDu?r~dI)4)&iE_!_!N1|K+xOaI zZ_xJ#-7y_FUD{$@+H*RC77fF3*a2xX8f7TxY08z)S>mIJwsx==h$h15e|3Z>3zbT*VM6^et37c1z$pmGKd199uvFU0H_F`FH*UJAURWu^e1 zn$sNMWdY>A5HjEi#d4&CD&aKFk<9?jivrmhlXSxDRfX$1u7C*>A$EaOTT83>+ZKPG ztLl+}>?R48Oj*UNj((CrVsZqx0W z3QYNur?y;`7Ph+OXGYiu8uNq^J!NF)mml060s{)!(LXplM5WB2-RqsCdd$1b3hZNDi6%|NiYivovM00hctHWD#9F zBj;RC34;bM$W(|kqOy1)SR~*l@ce|&7-->(c#I>1kmza-1FTFX)3gB3n3`j;Nl!py z5jO@SQ^gn1X;dnb083*8-6Vj>`^28_)9Zvwl%FmPn zBcL~#E{PREOfG3hlOG7gv5^i!6o;ObXbX>?a<$cXBL`~vQ5G;%Wek1}ArNx0Xv18t zWFhn-=!yD99fg47O#?VtvZ?@Dj7ooMaTK5nHZ0tiH& zSZhD+3{lf51>~Vk2)^Ryg6Zw6;UR=NBPxj6ff zcQ(igHb+BfMzkDRfyqTh$OSgz!e{HtCfR3Nv!x2qGGq@%e#uN*>hd!LI@T;QTq-h2 zqDRc8HWZMjG^nA9JO`=)Lge!EghGF~{4Z>((1T1aKPS+v%};sayZj8@9Xbh(FaNF~ zdWtlSrkO1L1VY%zO%#iUbEHx@-coD!F$7n6U2(02yDH?M58Gb_elNqciL6;_hm%SeuE&$y%jv7V_2tP<}C6)NS>#C zG!auTWny^A98VLiy z<=s2{O=!84gr+o$I=W3 zH2^KowWI`485hKLuAr%Tl{J_!6!s&PMWD69=?(@Z_-ss}fUy9P#Ox10t>d#~p+Thc zcy5}NCRoKJOp9K#d9HtBZ_9i@z)L&V&p*RxvRL+;#M^QH=5+laXoX?B=QzXRpw;gT z#(~#!J=bqB*7LiJwa4Mu?F}4%OwEX)^0gYQ3clh-35^T@njHoJ84o$D_Ajm)5pm9% zBUIKj5PU_Q13uvzrf|n&jA>GWc7u96c&9JYE8MksS{jjEr`hg$gZ^$1YhN|JLEFi< zO_nyj?tIUy+G+U0od%fkn>|9P6$!uCJpd_Fp$u&I;q$Y=QyC)GCq4YQEoyf z8%NwK{SL-j7>|9n8K6xCKb5zRO|nIguPEcJDD&g1KfgNu`PBwxB*V4Hy9kz*(PlR- zmjLmZj6i&Cs&%S{yLsPX3#ghxPIgUSdW)SY!dZ~nb}TE&lF5pWPm@W+)pUc7MGKqt zTRi=Q!j#TZ>|L72xN{z-wV~VX^aq1>tKD}8PQ4~5FL#}fHCQH2cydtQVftwEIYJ8& z2|Z^FORFDWzXYMi?mxBm{^UJ_|IpZ~?Ql2W;mYNgqZdNdcANuvzHNt$mxd8kdAK5F zXmB4Ioa?n1)_0gcUI8#>#+xe&{y9HCZ<-zeYdk*7-E+Kmxa&2YU1<1S;P2Kx0d^J! 
z8XaPE{id<_8a3Z8g>Hl|IdouPuy<^PulKG+^e)A8a+ZkmWJA0l5c^!S?Y(Qg)F-0^ z%xuOcQ0%YM;;N03BMwQ*Gqvaxl+{}V*AK&I2yeg61UauH2O}*sdgc* z!Rr9JrvyaE5!KKR1g@{hKjRLsfZGts5!n}U3d=D`Mz$kYm|lbF)8(93;dKpOi!p&T z$ZrV9I$4aUb#icYx<4W(Qur_x;>kpjnTVQX?-+`;WIzrRyE5Zu+kRgZh~wtRGhC2H zvqc@H&FMY2^&kweUx47C4*j%pPh{M_Ko-_W~rbuvoxvF^zD{J1Qz!`Sqc9)Hg?ar!SiLqp^Sdc4->$27XW9 z?kF% z^31b2d1k*vo`H4OkB0nox*$Its~#-==zQ!|63V z=cc@E&uzAE%-i<7VasWH{eDHQmf}Uk zK5%_eC#ggNSqtWEEte5oFvb3B3H};L!&my5Y`3;B~biO6Au1?IfG?Bqc~%zwIyT}!okQh zjUrxzB`}Maf|3TxRPvakAj?>XX(!qRP@2GwsHKt#k|hC5VuD=zCK;s&Vt}mSoM#Mf zl`um%XC&o`;d)t?bO9TJkH$+o*2MgXmkhe>vV3x`Ks^0LKLf+6<9^Upcr> z`k){5damntJ3+_o_dA2o4f-&f?K01AcY;B`R?!C^;&A!R(Fdd0R}HuKxHW7&!MRmM zBK)S}<%(%Tl)ANc)nr@JusF~KAf6?+zw}c2^~&qkz5LCiWkc*0 zY=}{r4KXroh^?%^pQ#M$7wz{oo)2MQY#fluZ@JRv(r?~)xP=U|m2i^uQ; zhU17)WX5sJAA<=Qwo9ZCcnoCD@)?6W^G1s^EzLockCT6KpX?|NwH1@7e3Lx=CF7v3 zId0Xe2h!KYnud_I;OAA$AvN&Z?<1c(h9`$H(8WYU;woJ3sohOb{Sxlqo!~kC8CSe4Q1HyhE#6 zYL4SPxAZ6EW*Ys=4de(J)048t6!QPLUXh0;8!g5ekF-5DmL~yAk)nq*PI#J3K!M{7 znT7b`PZ?5Z0LsrSk!vgr*AY*2gSnN$H3d%IX+on5yeP3XbM4F{#m)5`O>1RpPJYAh zWEjdJXc_TeS^20{Um@oZ_lQ(|`zg5aJf#{5PxIicigNZ%MamTur;2j+?NH7(pIc}8(l>T)U1#97ystwy zn@8{U=hn?9{5L$g&LI7}FwSl;8egMkMLDY|XZJ-pt4^-FAzszVbrl8eYvZ*T6Mp;2 zb;X!^^~rSy^{t22?Uh#Pd-GL#MQ8i%E4WR^(S2Vpxc^xG{013J^X;ek9%OYU-NW~4 zc^|iO=Yx0W=H2Ydu@By{tFId>mp*ux(o75U*Q)HhA_8SD5)pG6RnC3z&Mlu0IlA(E zNPGc20vpZwar1+_^YxHDDRNYu68e2b!&f>jw2FbR9Rt-V!GB!eFZ%!f^Q-#*^0&X9 zGZ8Z_FH8o2N)<)?0a3)u8=NGYJmF8(kyc9fjBnUnCZV{=!G7X1nKI%SbM2v^zt$-*jDXosoQpw82)V!@8ZW*BSTw!=cmewtKz6_wX+j`kl~i zcUo?@OKEQ$j(Z(eyD4q(cgj4Qxr4VLqTOXis`}6u_MtB!r`@e1w-VOcE0~qvjoO!= z-#v8=a*=Azv~GrcmnFL;rPXfRb6Z_Db{gG5H)yo`UAHmr^m~ogxZQDSyGsXddmF(m zhSh%xhFQRfM@aAFoPFHv(%ng5wu_ctlPCsE~dj?*UXD%E804%m0RTDhu!^WG>;^L{b`Z1#=lnE+i}}&+s#ps*@Lxad~pa;UnZdJi-`0 sd4pZ2vkSr+q~+uvn-$;Yd*<6zyG0LXw`fD}H#) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); } + public static Dataset readOafKryoPath( + SparkSession spark, String inputPath, Class clazz) { + return spark + 
.read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.kryo(clazz)); + } + + public static Class[] getModelClasses() { + List> modelClasses = Lists.newArrayList(ModelSupport.getOafModelClasses()); + modelClasses + .addAll( + Lists + .newArrayList( + Result.class, + Qualifier.class, + DataInfo.class, + Publication.class, + eu.dnetlib.dhp.schema.oaf.Dataset.class, + Software.class, + OtherResearchProduct.class, + Subject.class, + AccessRight.class)); + return modelClasses.toArray(new Class[] {}); + } + } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/PrepareResultResultStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/PrepareResultResultStep1.java index 0a82c3981..f35ad52e1 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/PrepareResultResultStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/PrepareResultResultStep1.java @@ -70,7 +70,7 @@ public class PrepareResultResultStep1 implements Serializable { final List allowedSemRel = Arrays .asList( - parser.get("allowedSemRel").split(";")) + parser.get("allowedsemrels").split(";")) .stream() .map(s -> s.toLowerCase()) .collect(Collectors.toList()); @@ -98,7 +98,7 @@ public class PrepareResultResultStep1 implements Serializable { Dataset result = readPath(spark, inputPath + "/" + resultType, resultClazz) .filter( (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && - !r.getDataInfo().getInvisible() && + !r.getDataInfo().getInvisible() && Optional.ofNullable(r.getSubject()).isPresent() && r .getSubject() .stream() @@ -116,22 +116,28 @@ public class PrepareResultResultStep1 implements Serializable { (MapGroupsFunction, ResultSubjectList>) (k, it) -> getResultSubjectList(subjectClassList, k, it), Encoders.bean(ResultSubjectList.class)) + .filter(Objects::nonNull) 
.write() .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath + "/" + resultType); } - @NotNull private static ResultSubjectList getResultSubjectList(List subjectClassList, String k, Iterator> it) { + Tuple2 first = it.next(); + if (!Optional.ofNullable(first._1()).isPresent()) { + return null; + } ResultSubjectList rsl = new ResultSubjectList(); rsl.setResId(k); - Tuple2 first = it.next(); List sbjInfo = new ArrayList<>(); Set subjectSet = new HashSet<>(); extracted(subjectClassList, first._1().getSubject(), sbjInfo, subjectSet); - it.forEachRemaining(t2 -> extracted(subjectClassList, t2._1().getSubject(), sbjInfo, subjectSet)); + it.forEachRemaining(t2 -> { + if (Optional.ofNullable(t2._1()).isPresent()) + extracted(subjectClassList, t2._1().getSubject(), sbjInfo, subjectSet); + }); rsl.setSubjectList(sbjInfo); return rsl; } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SparkSubjectPropagationStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SparkSubjectPropagationStep2.java index d546a8d8f..2a3bcff51 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SparkSubjectPropagationStep2.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/subjecttoresultfromsemrel/SparkSubjectPropagationStep2.java @@ -50,6 +50,7 @@ public class SparkSubjectPropagationStep2 implements Serializable { final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); + final String resultClassName = parser.get("resultTableName"); log.info("resultTableName: {}", resultClassName); @@ -58,14 +59,15 @@ public class SparkSubjectPropagationStep2 implements Serializable { final String resultType = parser.get("resultType"); log.info("resultType: {}", resultType); - final String inputPath = parser.get("inputPath"); + final String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", 
inputPath); final String workingPath = parser.get("workingPath"); log.info("workingPath: {}", workingPath); SparkConf conf = new SparkConf(); - + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(getModelClasses()); runWithSparkSession( conf, isSparkSessionManaged, @@ -83,7 +85,11 @@ public class SparkSubjectPropagationStep2 implements Serializable { Class resultClazz, String resultType) { - Dataset results = readPath(spark, inputPath + "/" + resultType, resultClazz); + Dataset> results = readOafKryoPath(spark, inputPath + "/" + resultType, resultClazz) + .map( + (MapFunction>) r -> new Tuple2(r.getId(), r), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(resultClazz))); + Dataset preparedResult = readPath( spark, preparedPath + "/publication", ResultSubjectList.class) .union(readPath(spark, preparedPath + "/dataset", ResultSubjectList.class)) @@ -93,20 +99,26 @@ public class SparkSubjectPropagationStep2 implements Serializable { results .joinWith( preparedResult, - results.col("id").equalTo(preparedResult.col("resId")), + results.col("_1").equalTo(preparedResult.col("resId")), "left") - .map((MapFunction, R>) t2 -> { - R res = t2._1(); + .map((MapFunction, ResultSubjectList>, String>) t2 -> { + R res = t2._1()._2(); + // estraggo le tipologie di subject dal result + Map> resultMap = new HashMap<>(); if (Optional.ofNullable(t2._2()).isPresent()) { - // estraggo le tipologie di subject dal result - Map> resultMap = new HashMap<>(); - res.getSubject().stream().forEach(s -> { - String cid = s.getQualifier().getClassid(); - if (!resultMap.containsKey(cid)) { - resultMap.put(cid, new ArrayList<>()); - } - resultMap.get(cid).add(s.getValue()); - }); + if(Optional.ofNullable(res.getSubject()).isPresent()){ + res.getSubject().stream().forEach(s -> { + String cid = s.getQualifier().getClassid(); + if(!cid.equals(ModelConstants.DNET_SUBJECT_KEYWORD)){ + if (!resultMap.containsKey(cid)) { + resultMap.put(cid, new 
ArrayList<>()); + } + resultMap.get(cid).add(s.getValue()); + } + }); + }else{ + res.setSubject(new ArrayList<>()); + } // Remove from the list all the subjects with the same class already present in the result List distinctClassId = t2 @@ -142,12 +154,12 @@ public class SparkSubjectPropagationStep2 implements Serializable { } } - return res; - }, Encoders.bean(resultClazz)) + return OBJECT_MAPPER.writeValueAsString(res); + }, Encoders.STRING()) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") - .json(workingPath + "/" + resultType); + .text(workingPath + "/" + resultType); readPath(spark, workingPath + "/" + resultType, resultClazz) .write() diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_preparesubjecttoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_preparesubjecttoresult_parameters.json index a8ec1d5b3..1e3ac1af4 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_preparesubjecttoresult_parameters.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_preparesubjecttoresult_parameters.json @@ -3,7 +3,7 @@ { "paramName":"asr", - "paramLongName":"allowedSemRel", + "paramLongName":"allowedsemrels", "paramDescription": "the set of semantic relations between the results to be exploited to perform the propagation", "paramRequired": true }, diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_propagatesubject_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_propagatesubject_parameters.json index 0cb51c598..76942cbe6 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_propagatesubject_parameters.json +++ 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/input_propagatesubject_parameters.json @@ -13,8 +13,8 @@ "paramRequired": true }, { - "paramName":"ip", - "paramLongName":"inputPath", + "paramName":"sp", + "paramLongName":"sourcePath", "paramDescription": "the path of the input graph", "paramRequired": true }, diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/config-default.xml index caf3c6050..0ce8cef58 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/config-default.xml @@ -48,7 +48,7 @@ sparkExecutorMemory - 6G + 10G sparkExecutorCores diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml index b7f48a4e0..b16a1b00f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml @@ -1,12 +1,4 @@ - - - - - - - - sourcePath @@ -16,14 +8,6 @@ subjectlist the list of subject classid to propagate (split by ;) - - resultType - the result tapy - - - resultTableName - the class of the result - allowedsemrels the allowed semantics @@ -64,14 +48,14 @@ - + yarn cluster - PrepareProjectResultsAssociation + PrepareSubjectResultsAssociation eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1 dhp-enrichment-${projectVersion}.jar @@ -98,7 +82,7 @@ yarn cluster - PrepareProjectResultsAssociation + PrepareSubjectResultsAssociation 
eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1 dhp-enrichment-${projectVersion}.jar @@ -125,7 +109,7 @@ yarn cluster - PrepareProjectResultsAssociation + PrepareSubjectResultsAssociation eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1 dhp-enrichment-${projectVersion}.jar @@ -152,7 +136,7 @@ yarn cluster - PrepareProjectResultsAssociation + PrepareSubjectResultsAssociation eu.dnetlib.dhp.subjecttoresultfromsemrel.PrepareResultResultStep1 dhp-enrichment-${projectVersion}.jar @@ -188,12 +172,12 @@ yarn cluster - ProjectToResultPropagation - eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob + SubjectToResultPropagation + eu.dnetlib.dhp.subjecttoresultfromsemrel.SparkSubjectPropagationStep2 dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} + --executor-memory=8G --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} @@ -201,8 +185,9 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + --conf spark.sql.shuffle.partitions=3840 - --inputPath${sourcePath} + --sourcePath${sourcePath} --outputPath${outputPath} --workingPath${workingDir}/working --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication @@ -217,8 +202,8 @@ yarn cluster - ProjectToResultPropagation - eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob + SubjectToResultPropagation + eu.dnetlib.dhp.subjecttoresultfromsemrel.SparkSubjectPropagationStep2 dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -230,8 +215,9 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + --conf spark.sql.shuffle.partitions=3840 
- --inputPath${sourcePath} + --sourcePath${sourcePath} --outputPath${outputPath} --workingPath${workingDir}/working --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct @@ -246,8 +232,8 @@ yarn cluster - ProjectToResultPropagation - eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob + SubjectToResultPropagation + eu.dnetlib.dhp.subjecttoresultfromsemrel.SparkSubjectPropagationStep2 dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -259,8 +245,9 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + --conf spark.sql.shuffle.partitions=3840 - --inputPath${sourcePath} + --sourcePath${sourcePath} --outputPath${outputPath} --workingPath${workingDir}/working --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset @@ -275,8 +262,8 @@ yarn cluster - ProjectToResultPropagation - eu.dnetlib.dhp.projecttoresult.SparkResultToProjectThroughSemRelJob + SubjectToResultPropagation + eu.dnetlib.dhp.subjecttoresultfromsemrel.SparkSubjectPropagationStep2 dhp-enrichment-${projectVersion}.jar --executor-cores=${sparkExecutorCores} @@ -288,8 +275,9 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + --conf spark.sql.shuffle.partitions=3840 - --inputPath${sourcePath} + --sourcePath${sourcePath} --outputPath${outputPath} --workingPath${workingDir}/working --resultTableNameeu.dnetlib.dhp.schema.oaf.Software @@ -300,7 +288,7 @@ - + diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPreparationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPreparationJobTest.java index f782feab9..0b3b45d7e 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPreparationJobTest.java 
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPreparationJobTest.java @@ -81,7 +81,7 @@ public class SubjectPreparationJobTest { PrepareResultResultStep1 .main( new String[] { - "-allowedSemRel", + "-allowedsemrels", "IsSupplementedBy;IsSupplementTo;IsPreviousVersionOf;IsNewVersionOf;IsIdenticalTo;Obsoletes;IsObsoletedBy;IsVersionOf", "-subjectlist", "fos;sdg", "-resultType", "publication", diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPropagationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPropagationJobTest.java index b324b49d8..48c425bbc 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPropagationJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/subjectpropagation/SubjectPropagationJobTest.java @@ -76,7 +76,7 @@ public class SubjectPropagationJobTest { .getResource("/eu/dnetlib/dhp/subjectpropagation/preparedInfo") .getPath(), "-resultType", "publication", - "-inputPath", getClass() + "-sourcePath", getClass() .getResource("/eu/dnetlib/dhp/subjectpropagation") .getPath(), "-isSparkSessionManaged", Boolean.FALSE.toString(), -- 2.17.1 From 777d0bf560ad7d0809e201d3672bfcf1c2cdd157 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Tue, 31 Jan 2023 10:29:27 +0100 Subject: [PATCH 3/3] [subjectPropagation] added default values for parameters --- .../eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml index b16a1b00f..02c015f97 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml +++ 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/subjectpropagation/oozie_app/workflow.xml @@ -6,10 +6,12 @@ subjectlist + fos;sdg the list of subject classid to propagate (split by ;) allowedsemrels + IsSupplementedBy;IsSupplementTo;IsPreviousVersionOf;IsNewVersionOf;IsIdenticalTo;Obsoletes;IsObsoletedBy;IsVersionOf the allowed semantics -- 2.17.1