Issue
I haven't seen anything asked quite like this, so I wrote some code like:
import pandas as pd
import pyodbc
from collections import Counter
import itertools

# Placeholder connection details -- fill in before running.
server = ""
cnxn = ""  # should be a live pyodbc connection, e.g. pyodbc.connect(...)
query = ("")
try:
    # Run the query and force every column to the pandas string dtype.
    df = pd.read_sql(query, cnxn).astype('string')
except Exception:
    # Was a bare `except:` -- that also swallows SystemExit/KeyboardInterrupt.
    # Catching Exception keeps the "best effort" behavior without hiding
    # interpreter-level signals.
    print("query failed")
else:
    # Close only on success, matching the original control flow (on failure
    # the connection object may not be in a closeable state).
    cnxn.close()
which returns a dataframe like this:
partNo planStatus planRev operation
------ ---------- ------- ---------
110068 Released 2A 0100-00-0
110068 Released 2A 0200-00-0
110383 Released 3B 0100-00-0
110383 Released 3B 0200-00-0
110384 In Dev 1C 0100-00-0
110384 In Dev 1C 0200-00-0
so now I want to add a row for each part number that has an operation '000':
# One entry per distinct part number, in first-seen order.
dfNums = df['partNo'].unique().tolist()
# Skeleton frame pairing every part number with the synthetic operation '000'.
temp = pd.DataFrame({'partNo': dfNums, 'operation': ['000'] * len(dfNums)})
# Append the skeleton rows; after sorting, '000' leads each part's group
# because it compares lower than the real operation codes ('0100-...', ...).
df = pd.concat([df, temp]).sort_values(by=['partNo', 'operation'])
which returns a dataframe like this:
partNo planStatus planRev operation
------ ---------- ------- ---------
110068 000
110068 Released 2A 0100-00-0
110068 Released 2A 0200-00-0
110383 000
110383 Released 3B 0100-00-0
110383 Released 3B 0200-00-0
110384 000
110384 In Dev 1C 0100-00-0
110384 In Dev 1C 0200-00-0
So now in order to get planStatus and planRev to populate to the '000' operation row, the best way I could think of was:
# For each part number: copy the modal planRev onto every one of its rows,
# blank planStatus everywhere, then restore the modal planStatus on the
# synthetic '000' row only -- matching the desired report layout.
for num in dfNums:
    # Hoist the part mask: the original recomputed `df['partNo'] == num`
    # five times per iteration.
    part_mask = df['partNo'] == num
    # Counter.most_common(1) yields the modal value directly; ties resolve
    # to the first value encountered, same as max(values, key=counter.get).
    mostCommonRev = Counter(df.loc[part_mask, 'planRev']).most_common(1)[0][0]
    mostCommonStatus = Counter(df.loc[part_mask, 'planStatus']).most_common(1)[0][0]
    df.loc[part_mask, 'planRev'] = mostCommonRev
    df.loc[part_mask, 'planStatus'] = ""
    df.loc[part_mask & (df['operation'] == '000'), 'planStatus'] = mostCommonStatus
Which I can't imagine is anywhere near the most efficient way to do this. Is there a better way to do this using groupby? Or is there just any better way in general? Iterating through dataframes like this gives me a gross feeling, but it's the only way I could achieve the output I want, which looks like:
partNo planStatus planRev operation
------ ---------- ------- ---------
110068 Released 2A 000
110068 2A 0100-00-0
110068 2A 0200-00-0
110383 Released 3B 000
110383 3B 0100-00-0
110383 3B 0200-00-0
110384 In Dev 1C 000
110384 1C 0100-00-0
110384 1C 0200-00-0
Edit for @rayad's comment: I ended up doing something quite similar to what @rayad's comment was leading to, and it seemed faster — though I never actually measured the speed.
# create a temp df of all part nums and operation 000
routingNums = list(df['partNo'].drop_duplicates())
temp = pd.DataFrame({'partNo':routingNums, 'operation':['000']*len(routingNums)})
# add the temp df of '000' ops to the main df and sort
# ('000' sorts before the real operation codes, so each synthetic row
# ends up leading its part number's group)
df = pd.concat([df, temp]).sort_values(by=['partNo', 'operation']).reset_index(drop=True)
# make all cimxDatabase values WLCAPP
# NOTE(review): assumes df has a 'cimxDatabase' column -- it is not shown in
# the sample query output above; confirm it exists before running.
df.loc[df['operation'] == '000', 'cimxDatabase'] = 'WLCAPP'
# make all '000' op rows have the same planRev and planStatus as the rest of the partNo's associated
# keep='last' picks the last row per partNo which, after the sort above, is a
# real operation row whose planRev/planStatus are populated.
check = df[['partNo','planRev','planStatus']].drop_duplicates(subset='partNo', keep='last')
# Broadcast the per-part values back onto one row per original df row,
# aligned with df by position (left merge preserves row order here).
df_to_merge = df[['partNo']].merge(check, on='partNo', how='left')
# NOTE(review): update(overwrite=True) fills planRev AND planStatus on every
# row, not only the '000' rows -- that differs from the desired output shown
# earlier, where non-'000' rows have a blank planStatus; verify intent.
df.update(df_to_merge, overwrite=True)
Solution
With the first dataframe you provided:
import pandas as pd

# Sample data reproducing the query output shown in the question.
_parts = [110068, 110068, 110383, 110383, 110384, 110384]
_status = ["Released", "Released", "Released", "Released", "In Dev", "In Dev"]
_revs = ["2A", "2A", "3B", "3B", "1C", "1C"]
_ops = ["0100-00-0", "0200-00-0"] * 3
df = pd.DataFrame(
    {"partNo": _parts, "planStatus": _status, "planRev": _revs, "operation": _ops}
)
Here is another way to do it:
# Create one skeleton row per part number with the synthetic operation '000'.
# Building a single DataFrame replaces the original list of one-row frames
# (one pd.DataFrame allocation per part number). Scalars broadcast to the
# length of the unique partNo array.
new_rows = pd.DataFrame(
    {
        "partNo": df["partNo"].unique(),
        "planStatus": pd.NA,
        "planRev": pd.NA,
        "operation": "000",
    }
)
df = (
    pd.concat([df, new_rows])
    # '000' sorts before '0100-00-0', so each NA skeleton row sits directly
    # above its part's first real row and the back-fill below pulls values
    # from within the same part group.
    .sort_values(by=["partNo", "operation"])
    # .bfill() replaces fillna(method="bfill"), which is deprecated since
    # pandas 2.1 and removed in later releases.
    .bfill()
    .reset_index(drop=True)
)
# Remove duplicated values in `planStatus` column (blank on real operation rows).
df.loc[df["operation"] != "000", "planStatus"] = ""
And so:
print(df)
# Output
partNo planStatus planRev operation
0 110068 Released 2A 000
1 110068 2A 0100-00-0
2 110068 2A 0200-00-0
3 110383 Released 3B 000
4 110383 3B 0100-00-0
5 110383 3B 0200-00-0
6 110384 In Dev 1C 000
7 110384 1C 0100-00-0
8 110384 1C 0200-00-0
Answered By - Laurent
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.