forked from D-Net/dnet-hadoop
Merge branch 'beta' into affiliationPropagation
This commit is contained in:
commit
3974fa7dc1
|
@ -27,7 +27,10 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
public static final int ORCID_LEN = 19;
|
public static final int ORCID_LEN = 19;
|
||||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||||
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
||||||
public static final String TITLE_FILTER_REGEX = "(test)|\\W|\\d";
|
|
||||||
|
public static final String TITLE_TEST = "test";
|
||||||
|
public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
|
||||||
|
|
||||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
||||||
|
|
||||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||||
|
@ -195,10 +198,16 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
final String title = sp
|
final String title = sp
|
||||||
.getValue()
|
.getValue()
|
||||||
.toLowerCase();
|
.toLowerCase();
|
||||||
final String residual = Unidecode
|
final String decoded = Unidecode.decode(title);
|
||||||
.decode(title)
|
|
||||||
.replaceAll(TITLE_FILTER_REGEX, "");
|
if (StringUtils.contains(decoded, TITLE_TEST)) {
|
||||||
return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
return decoded
|
||||||
|
.replaceAll(TITLE_FILTER_REGEX, "")
|
||||||
|
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
||||||
|
}
|
||||||
|
return !decoded
|
||||||
|
.replaceAll("\\W|\\d", "")
|
||||||
|
.isEmpty();
|
||||||
})
|
})
|
||||||
.map(GraphCleaningFunctions::cleanValue)
|
.map(GraphCleaningFunctions::cleanValue)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
|
|
|
@ -14,6 +14,9 @@ import org.xml.sax.SAXException;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
@ -152,4 +155,25 @@ public class DedupUtility {
|
||||||
return o1.compareTo(o2);
|
return o1.compareTo(o2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Relation createSimRel(String source, String target, String entity) {
|
||||||
|
final Relation r = new Relation();
|
||||||
|
r.setSource(source);
|
||||||
|
r.setTarget(target);
|
||||||
|
r.setSubRelType("dedupSimilarity");
|
||||||
|
r.setRelClass(ModelConstants.IS_SIMILAR_TO);
|
||||||
|
r.setDataInfo(new DataInfo());
|
||||||
|
|
||||||
|
switch (entity) {
|
||||||
|
case "result":
|
||||||
|
r.setRelType(ModelConstants.RESULT_RESULT);
|
||||||
|
break;
|
||||||
|
case "organization":
|
||||||
|
r.setRelType(ModelConstants.ORG_ORG_RELTYPE);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new IllegalArgumentException("unmanaged entity type: " + entity);
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -63,7 +63,9 @@ public class SparkCopyRelationsNoOpenorgs extends AbstractSparkAction {
|
||||||
.toJavaRDD()
|
.toJavaRDD()
|
||||||
.filter(x -> !isOpenorgs(x));
|
.filter(x -> !isOpenorgs(x));
|
||||||
|
|
||||||
log.info("Number of non-Openorgs relations collected: {}", simRels.count());
|
if (log.isDebugEnabled()) {
|
||||||
|
log.debug("Number of non-Openorgs relations collected: {}", simRels.count());
|
||||||
|
}
|
||||||
|
|
||||||
spark
|
spark
|
||||||
.createDataset(simRels.rdd(), Encoders.bean(Relation.class))
|
.createDataset(simRels.rdd(), Encoders.bean(Relation.class))
|
||||||
|
|
|
@ -20,6 +20,7 @@ import org.xml.sax.SAXException;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.Block;
|
import eu.dnetlib.dhp.oa.dedup.model.Block;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
|
@ -102,7 +103,7 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
.createDataset(
|
.createDataset(
|
||||||
Deduper
|
Deduper
|
||||||
.computeRelations(sc, blocks, dedupConf)
|
.computeRelations(sc, blocks, dedupConf)
|
||||||
.map(t -> createSimRel(t._1(), t._2(), entity))
|
.map(t -> DedupUtility.createSimRel(t._1(), t._2(), entity))
|
||||||
.repartition(numPartitions)
|
.repartition(numPartitions)
|
||||||
.rdd(),
|
.rdd(),
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
|
@ -111,24 +112,4 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Relation createSimRel(String source, String target, String entity) {
|
|
||||||
final Relation r = new Relation();
|
|
||||||
r.setSource(source);
|
|
||||||
r.setTarget(target);
|
|
||||||
r.setSubRelType("dedupSimilarity");
|
|
||||||
r.setRelClass("isSimilarTo");
|
|
||||||
r.setDataInfo(new DataInfo());
|
|
||||||
|
|
||||||
switch (entity) {
|
|
||||||
case "result":
|
|
||||||
r.setRelType("resultResult");
|
|
||||||
break;
|
|
||||||
case "organization":
|
|
||||||
r.setRelType("organizationOrganization");
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
throw new IllegalArgumentException("unmanaged entity type: " + entity);
|
|
||||||
}
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -124,31 +124,12 @@ public class SparkWhitelistSimRels extends AbstractSparkAction {
|
||||||
|
|
||||||
Dataset<Relation> whiteListSimRels = whiteListRels2
|
Dataset<Relation> whiteListSimRels = whiteListRels2
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<Tuple2<String, String>, Relation>) r -> createSimRel(r._1(), r._2(), entity),
|
(MapFunction<Tuple2<String, String>, Relation>) r -> DedupUtility
|
||||||
|
.createSimRel(r._1(), r._2(), entity),
|
||||||
Encoders.bean(Relation.class));
|
Encoders.bean(Relation.class));
|
||||||
|
|
||||||
saveParquet(whiteListSimRels, outputPath, SaveMode.Append);
|
saveParquet(whiteListSimRels, outputPath, SaveMode.Append);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Relation createSimRel(String source, String target, String entity) {
|
|
||||||
final Relation r = new Relation();
|
|
||||||
r.setSource(source);
|
|
||||||
r.setTarget(target);
|
|
||||||
r.setSubRelType("dedupSimilarity");
|
|
||||||
r.setRelClass("isSimilarTo");
|
|
||||||
r.setDataInfo(new DataInfo());
|
|
||||||
|
|
||||||
switch (entity) {
|
|
||||||
case "result":
|
|
||||||
r.setRelType("resultResult");
|
|
||||||
break;
|
|
||||||
case "organization":
|
|
||||||
r.setRelType("organizationOrganization");
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
throw new IllegalArgumentException("unmanaged entity type: " + entity);
|
|
||||||
}
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,15 +43,6 @@ public class ResultMapper implements Serializable {
|
||||||
try {
|
try {
|
||||||
|
|
||||||
addTypeSpecificInformation(out, input, ort);
|
addTypeSpecificInformation(out, input, ort);
|
||||||
Optional<List<Measure>> mes = Optional.ofNullable(input.getMeasures());
|
|
||||||
if (mes.isPresent()) {
|
|
||||||
List<KeyValue> measure = new ArrayList<>();
|
|
||||||
mes
|
|
||||||
.get()
|
|
||||||
.forEach(
|
|
||||||
m -> m.getUnit().forEach(u -> measure.add(KeyValue.newInstance(m.getId(), u.getValue()))));
|
|
||||||
out.setMeasures(measure);
|
|
||||||
}
|
|
||||||
|
|
||||||
Optional
|
Optional
|
||||||
.ofNullable(input.getAuthor())
|
.ofNullable(input.getAuthor())
|
||||||
|
|
|
@ -8,6 +8,7 @@ import java.io.IOException;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -137,9 +138,21 @@ public class GraphCleaningFunctionsTest {
|
||||||
.stream()
|
.stream()
|
||||||
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
||||||
|
|
||||||
|
assertEquals(5, p_out.getTitle().size());
|
||||||
|
|
||||||
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out);
|
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out);
|
||||||
|
|
||||||
assertEquals(1, p_cleaned.getTitle().size());
|
assertEquals(3, p_cleaned.getTitle().size());
|
||||||
|
|
||||||
|
List<String> titles = p_cleaned
|
||||||
|
.getTitle()
|
||||||
|
.stream()
|
||||||
|
.map(StructuredProperty::getValue)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
assertTrue(titles.contains("omic"));
|
||||||
|
assertTrue(
|
||||||
|
titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test"));
|
||||||
|
assertTrue(titles.contains("「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"));
|
||||||
|
|
||||||
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
|
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
|
||||||
assertNull(p_out.getPublisher());
|
assertNull(p_out.getPublisher());
|
||||||
|
|
|
@ -52,8 +52,6 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def generateUpdates(spark:SparkSession):Unit = {
|
def generateUpdates(spark:SparkSession):Unit = {
|
||||||
val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString
|
val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString
|
||||||
|
|
||||||
|
@ -152,7 +150,6 @@ class ResolveEntitiesTest extends Serializable {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val pubDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/publication").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
|
val pubDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/publication").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication))
|
||||||
val t = pubDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
val t = pubDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count()
|
||||||
|
|
||||||
|
|
|
@ -864,7 +864,7 @@
|
||||||
"schemeid": "dnet:dataCite_title",
|
"schemeid": "dnet:dataCite_title",
|
||||||
"schemename": "dnet:dataCite_title"
|
"schemename": "dnet:dataCite_title"
|
||||||
},
|
},
|
||||||
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers"
|
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers test"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"dataInfo": {
|
"dataInfo": {
|
||||||
|
@ -887,6 +887,72 @@
|
||||||
"schemename": "dnet:dataCite_title"
|
"schemename": "dnet:dataCite_title"
|
||||||
},
|
},
|
||||||
"value": "test test 123 test"
|
"value": "test test 123 test"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"classname": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"schemeid": "dnet:provenanceActions",
|
||||||
|
"schemename": "dnet:provenanceActions"
|
||||||
|
},
|
||||||
|
"trust": "0.9"
|
||||||
|
},
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "main title",
|
||||||
|
"classname": "main title",
|
||||||
|
"schemeid": "dnet:dataCite_title",
|
||||||
|
"schemename": "dnet:dataCite_title"
|
||||||
|
},
|
||||||
|
"value": "omic"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"classname": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"schemeid": "dnet:provenanceActions",
|
||||||
|
"schemename": "dnet:provenanceActions"
|
||||||
|
},
|
||||||
|
"trust": "0.9"
|
||||||
|
},
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "main title",
|
||||||
|
"classname": "main title",
|
||||||
|
"schemeid": "dnet:dataCite_title",
|
||||||
|
"schemename": "dnet:dataCite_title"
|
||||||
|
},
|
||||||
|
"value": "「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"classname": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"schemeid": "dnet:provenanceActions",
|
||||||
|
"schemename": "dnet:provenanceActions"
|
||||||
|
},
|
||||||
|
"trust": "0.9"
|
||||||
|
},
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "main title",
|
||||||
|
"classname": "main title",
|
||||||
|
"schemeid": "dnet:dataCite_title",
|
||||||
|
"schemename": "dnet:dataCite_title"
|
||||||
|
},
|
||||||
|
"value": "-"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
File diff suppressed because one or more lines are too long
|
@ -11,7 +11,7 @@ create view if not exists TARGET.funder as select * from SOURCE.funder;
|
||||||
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
|
create view if not exists TARGET.fundref as select * from SOURCE.fundref;
|
||||||
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
|
create view if not exists TARGET.rndexpenditure as select * from SOURCE.rndexpediture;
|
||||||
|
|
||||||
create table TARGET.result as
|
create table TARGET.result stored as parquet as
|
||||||
select distinct * from (
|
select distinct * from (
|
||||||
select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
|
select * from SOURCE.result r where exists (select 1 from SOURCE.result_projects rp join SOURCE.project p on rp.project=p.id where rp.id=r.id)
|
||||||
union all
|
union all
|
||||||
|
@ -21,61 +21,62 @@ create table TARGET.result as
|
||||||
'openorgs____::759d59f05d77188faee99b7493b46805',
|
'openorgs____::759d59f05d77188faee99b7493b46805',
|
||||||
'openorgs____::b84450f9864182c67b8611b5593f4250',
|
'openorgs____::b84450f9864182c67b8611b5593f4250',
|
||||||
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975',
|
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975',
|
||||||
'openorgs____::eadc8da90a546e98c03f896661a2e4d4') )) foo;
|
'openorgs____::eadc8da90a546e98c03f896661a2e4d4',
|
||||||
|
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2') )) foo;
|
||||||
compute stats TARGET.result;
|
compute stats TARGET.result;
|
||||||
|
|
||||||
create table TARGET.result_citations as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_citations;
|
compute stats TARGET.result_citations;
|
||||||
|
|
||||||
create table TARGET.result_classifications as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_classifications;
|
compute stats TARGET.result_classifications;
|
||||||
|
|
||||||
create table TARGET.result_concepts as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_concepts;
|
compute stats TARGET.result_concepts;
|
||||||
|
|
||||||
create table TARGET.result_datasources as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_datasources stored as parquet as select * from SOURCE.result_datasources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_datasources;
|
compute stats TARGET.result_datasources;
|
||||||
|
|
||||||
create table TARGET.result_fundercount as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_fundercount stored as parquet as select * from SOURCE.result_fundercount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_fundercount;
|
compute stats TARGET.result_fundercount;
|
||||||
|
|
||||||
create table TARGET.result_gold as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_gold stored as parquet as select * from SOURCE.result_gold orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_gold;
|
compute stats TARGET.result_gold;
|
||||||
|
|
||||||
create table TARGET.result_greenoa as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_greenoa stored as parquet as select * from SOURCE.result_greenoa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_greenoa;
|
compute stats TARGET.result_greenoa;
|
||||||
|
|
||||||
create table TARGET.result_languages as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_languages stored as parquet as select * from SOURCE.result_languages orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_languages;
|
compute stats TARGET.result_languages;
|
||||||
|
|
||||||
create table TARGET.result_licences as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_licences stored as parquet as select * from SOURCE.result_licenses orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_licences;
|
compute stats TARGET.result_licences;
|
||||||
|
|
||||||
create table TARGET.result_oids as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_oids stored as parquet as select * from SOURCE.result_oids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_oids;
|
compute stats TARGET.result_oids;
|
||||||
|
|
||||||
create table TARGET.result_organization as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_organization stored as parquet as select * from SOURCE.result_organization orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_organization;
|
compute stats TARGET.result_organization;
|
||||||
|
|
||||||
create table TARGET.result_peerreviewed as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_peerreviewed stored as parquet as select * from SOURCE.result_peerreviewed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_peerreviewed;
|
compute stats TARGET.result_peerreviewed;
|
||||||
|
|
||||||
create table TARGET.result_pids as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_pids stored as parquet as select * from SOURCE.result_pids orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_pids;
|
compute stats TARGET.result_pids;
|
||||||
|
|
||||||
create table TARGET.result_projectcount as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_projectcount stored as parquet as select * from SOURCE.result_projectcount orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_projectcount;
|
compute stats TARGET.result_projectcount;
|
||||||
|
|
||||||
create table TARGET.result_projects as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_projects stored as parquet as select * from SOURCE.result_projects orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_projects;
|
compute stats TARGET.result_projects;
|
||||||
|
|
||||||
create table TARGET.result_refereed as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_refereed stored as parquet as select * from SOURCE.result_refereed orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_refereed;
|
compute stats TARGET.result_refereed;
|
||||||
|
|
||||||
create table TARGET.result_sources as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_sources stored as parquet as select * from SOURCE.result_sources orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_sources;
|
compute stats TARGET.result_sources;
|
||||||
|
|
||||||
create table TARGET.result_topics as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.result_topics;
|
compute stats TARGET.result_topics;
|
||||||
|
|
||||||
-- datasources
|
-- datasources
|
||||||
|
@ -84,7 +85,7 @@ create view if not exists TARGET.datasource_oids as select * from SOURCE.datasou
|
||||||
create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations;
|
create view if not exists TARGET.datasource_organizations as select * from SOURCE.datasource_organizations;
|
||||||
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
|
create view if not exists TARGET.datasource_sources as select * from SOURCE.datasource_sources;
|
||||||
|
|
||||||
create table TARGET.datasource_results as select id as result, datasource as id from TARGET.result_datasources;
|
create table TARGET.datasource_results stored as parquet as select id as result, datasource as id from TARGET.result_datasources;
|
||||||
compute stats TARGET.datasource_results;
|
compute stats TARGET.datasource_results;
|
||||||
|
|
||||||
-- organizations
|
-- organizations
|
||||||
|
@ -100,7 +101,7 @@ create view if not exists TARGET.project_oids as select * from SOURCE.project_oi
|
||||||
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
|
create view if not exists TARGET.project_organizations as select * from SOURCE.project_organizations;
|
||||||
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
|
create view if not exists TARGET.project_resultcount as select * from SOURCE.project_resultcount;
|
||||||
|
|
||||||
create table TARGET.project_results as select id as result, project as id from TARGET.result_projects;
|
create table TARGET.project_results stored as parquet as select id as result, project as id from TARGET.result_projects;
|
||||||
compute stats TARGET.project_results;
|
compute stats TARGET.project_results;
|
||||||
|
|
||||||
-- indicators
|
-- indicators
|
||||||
|
@ -121,19 +122,19 @@ create view TARGET.indi_pub_avg_year_content_oa as select * from SOURCE.indi_pub
|
||||||
create view TARGET.indi_pub_avg_year_context_oa as select * from SOURCE.indi_pub_avg_year_context_oa orig;
|
create view TARGET.indi_pub_avg_year_context_oa as select * from SOURCE.indi_pub_avg_year_context_oa orig;
|
||||||
create view TARGET.indi_pub_avg_year_country_oa as select * from SOURCE.indi_pub_avg_year_country_oa orig;
|
create view TARGET.indi_pub_avg_year_country_oa as select * from SOURCE.indi_pub_avg_year_country_oa orig;
|
||||||
|
|
||||||
create table TARGET.indi_pub_green_oa as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.indi_pub_green_oa stored as parquet as select * from SOURCE.indi_pub_green_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.indi_pub_green_oa;
|
compute stats TARGET.indi_pub_green_oa;
|
||||||
create table TARGET.indi_pub_grey_lit as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.indi_pub_grey_lit stored as parquet as select * from SOURCE.indi_pub_grey_lit orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.indi_pub_grey_lit;
|
compute stats TARGET.indi_pub_grey_lit;
|
||||||
create table TARGET.indi_pub_doi_from_crossref as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * from SOURCE.indi_pub_doi_from_crossref orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.indi_pub_doi_from_crossref;
|
compute stats TARGET.indi_pub_doi_from_crossref;
|
||||||
create table TARGET.indi_pub_gold_oa as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.indi_pub_gold_oa;
|
compute stats TARGET.indi_pub_gold_oa;
|
||||||
create table TARGET.indi_pub_has_abstract as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.indi_pub_has_abstract;
|
compute stats TARGET.indi_pub_has_abstract;
|
||||||
create table TARGET.indi_pub_has_cc_licence as select * from SOURCE.indi_pub_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.indi_pub_has_cc_licence stored as parquet as select * from SOURCE.indi_pub_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.indi_pub_has_cc_licence;
|
compute stats TARGET.indi_pub_has_cc_licence;
|
||||||
create table TARGET.indi_pub_has_cc_licence_url as select * from SOURCE.indi_pub_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
create table TARGET.indi_pub_has_cc_licence_url stored as parquet as select * from SOURCE.indi_pub_has_cc_licence_url orig where exists (select 1 from TARGET.result r where r.id=orig.id);
|
||||||
compute stats TARGET.indi_pub_has_cc_licence_url;
|
compute stats TARGET.indi_pub_has_cc_licence_url;
|
||||||
|
|
||||||
create view TARGET.indi_software_avg_year_content_oa as select * from SOURCE.indi_software_avg_year_content_oa orig;
|
create view TARGET.indi_software_avg_year_content_oa as select * from SOURCE.indi_software_avg_year_content_oa orig;
|
||||||
|
@ -143,15 +144,15 @@ create view TARGET.indi_software_avg_year_country_oa as select * from SOURCE.ind
|
||||||
--denorm
|
--denorm
|
||||||
alter table TARGET.result rename to TARGET.res_tmp;
|
alter table TARGET.result rename to TARGET.res_tmp;
|
||||||
|
|
||||||
create table TARGET.result_denorm as
|
create table TARGET.result_denorm stored as parquet as
|
||||||
select distinct r.*, rp.project, p.acronym as pacronym, p.title as ptitle, p.funder as pfunder, p.funding_lvl0 as pfunding_lvl0, rd.datasource, d.name as dname, d.type as dtype
|
select distinct r.*, rp.project, p.acronym as pacronym, p.title as ptitle, p.funder as pfunder, p.funding_lvl0 as pfunding_lvl0, rd.datasource, d.name as dname, d.type as dtype
|
||||||
from TARGET.res_tmp r
|
from TARGET.res_tmp r
|
||||||
join TARGET.result_projects rp on rp.id=r.id
|
left outer join TARGET.result_projects rp on rp.id=r.id
|
||||||
join TARGET.result_datasources rd on rd.id=r.id
|
left outer join TARGET.result_datasources rd on rd.id=r.id
|
||||||
join TARGET.project p on p.id=rp.project
|
left outer join TARGET.project p on p.id=rp.project
|
||||||
join TARGET.datasource d on d.id=rd.datasource;
|
left outer join TARGET.datasource d on d.id=rd.datasource;
|
||||||
compute stats TARGET.result_denorm;
|
compute stats TARGET.result_denorm;
|
||||||
|
|
||||||
alter table TARGET.result_denorm rename to TARGET.result;
|
alter table TARGET.result_denorm rename to TARGET.result;
|
||||||
drop table TARGET.res_tmp;
|
drop table TARGET.res_tmp;
|
||||||
--- done!
|
--- done!
|
|
@ -126,7 +126,7 @@ FROM ${stats_db_name}.otherresearchproduct_topics;
|
||||||
CREATE TABLE ${stats_db_name}.result_organization AS
|
CREATE TABLE ${stats_db_name}.result_organization AS
|
||||||
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
|
||||||
FROM ${openaire_db_name}.relation r
|
FROM ${openaire_db_name}.relation r
|
||||||
WHERE r.reltype = 'resultOrganization' and r.relclass='hasAuthorInstitution' and r.subreltype='affiliation'
|
WHERE r.reltype = 'resultOrganization'
|
||||||
and r.datainfo.deletedbyinference = false;
|
and r.datainfo.deletedbyinference = false;
|
||||||
|
|
||||||
CREATE TABLE ${stats_db_name}.result_projects AS
|
CREATE TABLE ${stats_db_name}.result_projects AS
|
||||||
|
|
Loading…
Reference in New Issue