Issue
I have a dataframe with a column of lists ('value') and a column holding the number of items in the respective list ('no_of_values'), sorted in descending order by no_of_values.
# Sample data: 'value' holds one list of items per row and 'no_of_values'
# holds the length of that list; rows run from the longest list to the shortest.
df = pd.DataFrame(
    {
        'value': [
            ['AB', 'BC', 'CD', 'DE', 'EF', 'FG', 'GH', 'HI'],
            ['BC', 'CD', 'DE', 'IJ', 'JK', 'KL', 'LM'],
            ['AB', 'CD', 'DE', 'IJ', 'JK', 'GH', 'HI'],
            ['AB', 'CD', 'DE', 'MN'],
            ['C', 'D', 'M'],
            ['MN', 'NO'],
            ['APQ'],
        ],
        'no_of_values': [8, 7, 7, 4, 3, 2, 1],
    }
)
Now I want to count how many items in each value list occur for the first time, going from the highest no_of_values to the lowest.
I tried:
# Flatten the lists to one item per row (the original row label is kept as the
# index), mark every item that has not appeared earlier in the flattened
# column, then count those marks per original row.
is_first_occurrence = ~df['value'].explode().duplicated()
df['no_of_1st_occurence'] = is_first_occurrence.groupby(level=0).sum()
Which results in:
value no_of_values no_of_1st_occurence
0 [AB, BC, CD, DE, EF, FG, GH, HI] 8 8
1 [BC, CD, DE, IJ, JK, KL, LM] 7 4
2 [AB, CD, DE, IJ, JK, GH, HI] 7 0
3 [AB, CD, DE, MN] 4 1
4 [C, D, M] 3 3
5 [MN, NO] 2 1
6 [APQ] 1 1
But this does not work when a no_of_values occurs more than once (here: "7"). In that case I would like to split the count of first occurrences into a strict no_of_1st_occurence (the item in the value list occurred neither for a higher no_of_values nor in another row with the same no_of_values) and shared_1st_occurences (the item in the value list did not occur for a higher no_of_values, but occurs at least one more time for the same no_of_values).
The desired outcome would be:
value no_of_values | strict_1st_occurence | shared_1st_occurence
0 [AB, BC, CD, DE, EF, FG, GH, HI] 8 8 0
1 [BC, CD, DE, IJ, JK, KL, LM] 7 2 2
2 [AB, CD, DE, IJ, JK, GH, HI] 7 0 2
3 [AB, CD, DE, MN] 4 1 0
4 [C, D, M] 3 3 0
5 [MN, NO] 2 1 0
6 [APQ] 1 1 0
Solution
Probably way too complicated, but I got it working and came to the desired outcome:
def remove_duplicates_from_list(x):
    """Return the items of *x* as a list with repeats removed.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable.
    """
    # dict keys are unique and keep insertion order, which gives an
    # order-preserving de-duplication in one pass.
    return list(dict.fromkeys(x))
def get_duplicates_from_list(L):
    """Return the items that occur more than once in *L*.

    Each duplicated item is reported once, in the order in which its second
    occurrence is encountered. Items must be hashable.
    """
    seen = set()
    # A dict is used as an insertion-ordered set so the result order is
    # deterministic instead of depending on set iteration order.
    duplicates = {}
    for item in L:
        if item in seen:
            duplicates[item] = None
        else:
            seen.add(item)
    return list(duplicates)
def get_intersection_from_two_lists(x, y):
    """Return the distinct items of *x* that also occur in *y*.

    The result follows the order of first appearance in *x*, so it is
    deterministic. Items must be hashable.
    """
    other = set(y)  # built once for O(1) membership tests
    return [item for item in dict.fromkeys(x) if item in other]
import itertools
import pandas as pd
# Sample data: 'value' holds one list of items per row and 'no_of_values'
# holds the length of that list; rows run from the longest list to the shortest.
df = pd.DataFrame({
    'value': [
        ['AB', 'BC', 'CD', 'DE', 'EF', 'FG', 'GH', 'HI'],
        ['BC', 'CD', 'DE', 'IJ', 'JK', 'KL', 'LM'],
        ['AB', 'CD', 'DE', 'IJ', 'JK', 'GH', 'HI'],
        ['AB', 'CD', 'DE', 'MN'],
        ['C', 'D', 'M'],
        ['MN', 'NO'],
        ['APQ'],
    ],
    'no_of_values': [8, 7, 7, 4, 3, 2, 1],
})

# Work on sets of the row items: membership is all that matters, and this
# avoids joining items into comma-separated strings (which breaks for
# non-string items or items that contain a comma).
row_sets = [set(items) for items in df['value']]

# Row positions grouped by 'no_of_values'. A dict keeps insertion order, so
# the groups are visited in the order their key first appears in the frame.
positions_by_group = {}
for position, group_key in enumerate(df['no_of_values']):
    positions_by_group.setdefault(group_key, []).append(position)

strict_counts = [0] * len(df)
shared_counts = [0] * len(df)
seen_in_earlier_groups = set()

for positions in positions_by_group.values():
    # Items of this group, and the subset found in more than one of its rows.
    seen_in_group = set()
    shared_in_group = set()
    for position in positions:
        shared_in_group |= row_sets[position] & seen_in_group
        seen_in_group |= row_sets[position]

    for position in positions:
        # First occurrences: items no earlier group has contained.
        first_occurrences = row_sets[position] - seen_in_earlier_groups
        # Shared: also present in another row of the same group.
        shared_counts[position] = len(first_occurrences & shared_in_group)
        # Strict: present in this row only.
        strict_counts[position] = len(first_occurrences - shared_in_group)

    # Only after the whole group is handled do its items count as "seen",
    # so rows with the same 'no_of_values' do not shadow each other.
    seen_in_earlier_groups |= seen_in_group

# Lists are assigned positionally, matching the row order used above.
df['strict_1st_occurence'] = strict_counts
df['shared_1st_occurence'] = shared_counts

df = df[['value', 'no_of_values', 'strict_1st_occurence', 'shared_1st_occurence']]
df
Answered By - user18334962
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.