24 lines
646 B
Python
24 lines
646 B
Python
%pyspark
|
|
from pyspark.sql.types import *
|
|
from pyspark.sql.functions import *
|
|
from lxml import etree
|
|
from datetime import datetime
|
|
|
|
@udf(ArrayType(StringType()))
def get_type(record):
    """Return the text of every 'Other' ``resourceType`` element in a record.

    The record is parsed as XML and searched, ignoring namespaces, for
    elements whose local name is ``resourceType`` and whose
    ``resourceTypeGeneral`` attribute equals ``'Other'``.

    Args:
        record: The XML document as a text string, or None when the
            source column is null.

    Returns:
        A list holding the ``.text`` of each matching element, in the
        order lxml returns them. An entry is None when the element has
        no text. The list is empty when nothing matches or when
        ``record`` is None.

    Raises:
        lxml.etree.XMLSyntaxError: If ``record`` is not well-formed XML.
    """
    # A null column value arrives here as None; without this guard the
    # .encode() call below raises AttributeError and fails the whole job.
    if record is None:
        return []

    # Parse from bytes rather than text -- presumably because lxml refuses
    # unicode input that carries an XML encoding declaration.
    root = etree.fromstring(record.encode('utf-8'))

    # local-name() makes the match independent of the namespace prefix.
    matches = root.xpath(
        "//*[local-name()='resourceType' and ./@resourceTypeGeneral='Other']"
    )
    return [element.text for element in matches]
|
|
|
|
|
|
# Load the input records. `path` is not assigned in this paragraph, so it
# must already be defined in the interpreter session.
df = spark.read.load(path)

# Explode the per-record list of types into one row per (id, type) pair,
# then count the ids seen for each distinct type.
types = (
    df.select(df.id, explode(get_type(df.body)).alias('type'))
    .groupBy('type')
    .agg(count(df.id).alias('cnt'))
    .collect()
)

# Emit a tab-separated table: a "%table" marker line, a header row, then
# one row per type. NOTE(review): "%table" looks like the Zeppelin display
# directive -- confirm against the notebook's display system.
# Single-argument print(...) calls behave the same on Python 2 and 3.
print("%table")
print("type\tcount")
for item in types:
    print("{}\t{}".format(item.type, item.cnt))
|