forked from D-Net/dnet-hadoop
[Person] remove the isolated nodes from the person set
This commit is contained in:
parent
32f444984e
commit
1fce7d5a0f
|
@@ -16,10 +16,8 @@ import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SaveMode;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@@ -84,9 +82,33 @@ public class SparkExtractPersonRelations {
|
||||||
spark,
|
spark,
|
||||||
sourcePath,
|
sourcePath,
|
||||||
workingPath);
|
workingPath);
|
||||||
|
removeIsolatedPerson(spark,sourcePath, workingPath);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void removeIsolatedPerson(SparkSession spark, String sourcePath, String workingPath) {
|
||||||
|
Dataset<Person> personDataset = spark.read().schema(Encoders.bean(Person.class).schema())
|
||||||
|
.json(sourcePath + "person")
|
||||||
|
.as(Encoders.bean(Person.class));
|
||||||
|
|
||||||
|
Dataset<Relation> relationDataset = spark.read().schema(Encoders.bean(Relation.class).schema())
|
||||||
|
.json(sourcePath + "relation")
|
||||||
|
.as(Encoders.bean(Relation.class));
|
||||||
|
|
||||||
|
personDataset.join(relationDataset, personDataset.col("id").equalTo(relationDataset.col("source")), "left_semi")
|
||||||
|
.write()
|
||||||
|
.option("compression","gzip")
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.json(workingPath + "person");
|
||||||
|
|
||||||
|
spark.read().schema(Encoders.bean(Person.class).schema())
|
||||||
|
.json(workingPath + "person")
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression","gzip")
|
||||||
|
.json(sourcePath + "person");
|
||||||
|
}
|
||||||
|
|
||||||
private static void extractRelations(SparkSession spark, String sourcePath, String workingPath) {
|
private static void extractRelations(SparkSession spark, String sourcePath, String workingPath) {
|
||||||
|
|
||||||
Dataset<Tuple2<String, Relation>> relationDataset = spark
|
Dataset<Tuple2<String, Relation>> relationDataset = spark
|
||||||
|
|
Loading…
Reference in New Issue