Issue
I want to get columns from 2 other tables to update in "a" table. This is like the mysql update statement -
UPDATE bucket_summary a,geo_count b, geo_state c
SET a.category_name=b.county_name,
a.state_code=c.state_code
WHERE a.category_id=b.county_geoid
AND b.state_fips=c.state_fips
AND a.category='county'
How to write this?
condition = [a.category_id=b.county_geoid, b.state_fips=c.state_fips, a.category='county']
df_a = df_a.join([df_b, df_c], condition, how= left)
is not working for me
Solution
import pyspark.sql.functions as f
########
# data
########
df_a = sc.parallelize([
[None, None, '123', 'country'],
['sc2', 'cn2', '234', 'state'],
['sc3', 'cn3', '456', 'country']
]).toDF(('state_code', 'category_name', 'category_id', 'category'))
df_a.show()
df_b = sc.parallelize([
['789','United States', 'asdf'],
['234','California', 'abc'],
['456','United Kingdom','xyz']
]).toDF(('county_geoid', 'country_name', 'state_fips'))
df_c = sc.parallelize([
['US','asdf'],
['CA','abc'],
['UK','xyz']
]).toDF(('state_code', 'state_fips'))
df_c = df_c.select(*(f.col(x).alias(x + '_df_c') for x in df_c.columns))
########
# update df_a with values from df_b & df_c
########
df_temp = df_a.join(df_b, [df_a.category_id == df_b.county_geoid, df_a.category=='country'], 'left').drop('county_geoid')
df_temp = df_temp.withColumn('category_name_new',
f.when(df_temp.country_name.isNull(), df_temp.category_name).
otherwise(df_temp.country_name)).drop('category_name','country_name').\
withColumnRenamed('category_name_new','category_name')
df_a = df_temp.join(df_c,[df_temp.state_fips == df_c.state_fips_df_c, df_temp.category=='country'], 'left').drop('state_fips_df_c','state_fips')
df_a = df_a.withColumn('state_code_new',
f.when(df_a.state_code_df_c.isNull(), df_a.state_code).
otherwise(df_a.state_code_df_c)).drop('state_code_df_c','state_code').\
withColumnRenamed('state_code_new','state_code')
df_a.show()
Original df_a:
+----------+-------------+-----------+--------+
|state_code|category_name|category_id|category|
+----------+-------------+-----------+--------+
| null| null| 123| country|
| sc2| cn2| 234| state|
| sc3| cn3| 456| country|
+----------+-------------+-----------+--------+
Output i.e. final df_a:
+-----------+--------+--------------+----------+
|category_id|category| category_name|state_code|
+-----------+--------+--------------+----------+
| 234| state| cn2| sc2|
| 123| country| null| null|
| 456| country|United Kingdom| UK|
+-----------+--------+--------------+----------+
Answered By - 1.618
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.