dnet-applications/apps/dhp-mdstore-manager/src/main/resources/zeppelin/analyzeTypes.py

24 lines
646 B
Python

%pyspark
from pyspark.sql.types import *
from pyspark.sql.functions import *
from lxml import etree
from datetime import datetime
@udf(ArrayType(StringType()))
def get_type(record):
    """Spark UDF: extract the text of every resourceType element whose
    resourceTypeGeneral attribute equals 'Other' from one XML record.

    :param record: the XML record body as a unicode string
    :return: list of the matching elements' text values
             (an element with no text contributes None)
    """
    root = etree.fromstring(record.encode('utf-8'))
    # BUG FIX: the original expression read "...='resourceType' and./@resourceTypeGeneral..."
    # — without a space after 'and' the XPath lexer reads "and." as a single NCName
    # (dots are legal NCName characters), making the expression unparseable.
    matches = root.xpath(
        "//*[local-name()='resourceType' and ./@resourceTypeGeneral='Other']")
    c_types = []
    for item in matches:
        c_types.append(item.text)
    return c_types
# NOTE(review): `spark` and `path` are expected to be bound by the Zeppelin
# context / an earlier notebook paragraph — confirm before running standalone.
df = spark.read.load(path)

# For each record, explode the list of "Other" resource types into one row
# per type, then count how many records carry each type.
types = df.select(df.id, explode(get_type(df.body)).alias('type')) \
          .groupBy('type') \
          .agg(count(df.id).alias('cnt')) \
          .collect()

# Emit a Zeppelin %table: header line, then tab-separated rows.
# FIX: the original used Python 2 print statements (`print "%table"`), which
# are syntax errors on the Python 3 interpreters current Spark ships with;
# the print() call form below is valid on both Python 2 and 3.
print("%table")
print("type\tcount")
for item in types:
    print("{}\t{}".format(item.type, item.cnt))