Issue
I have an initial dataframe and a random list of target cells (each cell being identified by a pair (x, y)). I want to produce four different dataframes, each containing the value of one of the contour (surrounding) cells of each specified target cell.
The surrounding cells to consider for a specific target cell are (x-1, y), (x, y-1), (x+1, y) and (x, y+1). Specifically, I randomly choose 1 to 4 of these cells and consider them as replacements for the target cell.
I want to do this with a pandas-specific approach, without having to define the contour cells separately and then apply the changes to the dataframe (i.e. using an all-in-one approach instead). I think using a pandas (or NumPy) approach would speed up the execution of the program.
The target cells could make up as much as 10% of the dataset; a good example to start with would have from 10 to 100 values.
For now I have written this example, which I think is not pandas-specific:
from numpy import random
import pandas as pd
import numpy as np
import operator
import math
from collections import deque
from queue import *
from queue import Queue
from itertools import product
def select_target_values(dataframe, number_of_target_values):
    """Draw random target cells from ``dataframe``.

    Each cell is an ``(x, y)`` tuple where ``x`` is a column position and
    ``y`` is a row position. Both coordinates are drawn with
    ``random.randint`` (``numpy.random`` in this file), x first and then y.

    NOTE(review): numpy's ``randint`` excludes its upper bound, so with the
    ``- 1`` below the last column and the last row are never selected --
    confirm whether that is intended.

    :param dataframe: frame whose column and row counts bound the draw.
    :param number_of_target_values: how many cells to draw (duplicates are
        possible).
    :return: list of ``(x, y)`` tuples, one per requested target.
    """
    x_upper = len(dataframe.columns) - 1
    y_upper = len(dataframe) - 1
    return [
        (random.randint(0, x_upper), random.randint(0, y_upper))
        for _ in range(number_of_target_values)
    ]
def select_contours(target_cells):
    """Pick a random number of neighbouring ("contour") cells per target cell.

    For every ``(x, y)`` target, between 1 and 4 neighbours are produced by
    adding the leading entries of the offset list
    ``(0, 1), (1, 0), (0, -1), (-1, 0)`` to the target coordinates. The
    neighbours are not checked against any dataframe bounds.

    :param target_cells: iterable of ``(x, y)`` pairs.
    :return: list with exactly one entry per target cell (same order); each
        entry is a list of 1 to 4 ``(x, y)`` tuples.
    :raises TypeError, ValueError: if a target cell is not a pair. Skipping
        such a cell silently would shift every later entry out of step with
        ``target_cells``.
    """
    contour_coordinates = [(0, 1), (1, 0), (0, -1), (-1, 0)]
    contour_cells = []
    for x, y in target_cells:
        # `random` is numpy.random here, whose randint excludes the upper
        # bound; `+ 1` makes all four neighbours reachable.
        contour_cells_count = random.randint(1, len(contour_coordinates) + 1)
        contour_cells.append(
            [
                (x + dx, y + dy)
                for dx, dy in contour_coordinates[:contour_cells_count]
            ]
        )
    return contour_cells
def create_zipf_distribution():
    """Build a 50-row, 5-column DataFrame of Zipf-distributed samples.

    Samples are drawn with ``random.zipf`` using distribution parameter 2
    and the frame is passed through ``DataFrame.round(3)`` before being
    returned.

    :return: ``pandas.DataFrame`` of shape ``(50, 5)``.
    """
    shape = (50, 5)
    samples = random.zipf(2, size=shape)
    return pd.DataFrame(samples.reshape(shape)).round(3)
def apply_contours(target_cells, contour_cells):
    """Group every target cell with its contour cells.

    :param target_cells: sequence of target cells.
    :param contour_cells: sequence where entry ``i`` holds the contour cells
        belonging to ``target_cells[i]``.
    :return: list of lists; each inner list starts with the target cell and
        is followed by that target's contour cells.
    :raises IndexError: if ``contour_cells`` has fewer entries than
        ``target_cells``.
    """
    return [
        [target_cell, *contour_cells[position]]
        for position, target_cell in enumerate(target_cells)
    ]
def create_possible_datasets(dataframe, target_cells_with_contour):
    """Build every dataset obtained by substituting target cells.

    ``target_cells_with_contour`` holds one group per target; the first
    entry of a group is the target cell ``(x, y)`` and the remaining entries
    are candidate replacement cells. For every combination that picks one
    entry from each group (Cartesian product), one copy of ``dataframe`` is
    produced per target, in which only that target's cell is overwritten
    with the value found at the picked cell.

    Cells are ``(x, y)`` = (column position, row position), hence the
    swapped order when indexing with ``iat`` / ``iloc``.

    NOTE(review): coordinates are not bounds-checked here; positional
    indexing decides what happens for negative or too-large values.

    :param dataframe: source frame; it is never modified.
    :param target_cells_with_contour: list of groups as described above.
    :return: list (one entry per combination) of lists (one DataFrame per
        target). Empty list when no groups are given.
    """
    snapshot = dataframe.copy()
    # The target is always the leading cell of its group.
    targets = [group[0] for group in target_cells_with_contour]

    results = []
    for combination in product(*target_cells_with_contour):
        if not combination:
            # product() of zero groups yields one empty tuple; ignore it.
            continue
        variants = []
        for target, chosen in zip(targets, combination):
            variant = dataframe.copy()
            # (x, y) cell -> [row=y, column=x]
            variant.iat[target[1], target[0]] = snapshot.iloc[
                chosen[1], chosen[0]
            ]
            variants.append(variant)
        results.append(variants)
    return results
def main():
    """Run the demo pipeline and print each intermediate result.

    Generates the Zipf dataset, draws 5 target cells, picks contour cells
    for them, groups targets with their contours and finally prints all
    generated datasets.
    """
    dataset = create_zipf_distribution()

    targets = select_target_values(dataset, 5)
    print(targets)

    contours = select_contours(targets)
    print(contours)

    grouped_cells = apply_contours(targets, contours)
    print(create_possible_datasets(dataset, grouped_cells))
# Run the demo pipeline as soon as this script is executed.
main()
If you have a better pandas approach (unifying all these methods into one that makes use of dataframe methods only), please let me know.
Solution
This is not a Pandas question; it's a NumPy question. Your code is pulled inappropriately in the direction of Pandas, and also inappropriately in the direction of pure Python iterations; neither should be used when base NumPy is available. The following has refactored everything somewhat heavily except `create_possible_datasets`, which needs more work. Your use of `product` creates a size explosion, and it's unclear what you're doing there or why that is necessary.
import numpy as np
from itertools import product
def select_target_values(
    dataset: np.ndarray, n_target_values: int, rand: np.random.Generator,
) -> np.ndarray:
    """Draw random cell coordinates inside ``dataset``.

    ``dataset.shape`` is used as the exclusive upper bound and is broadcast
    against the requested ``(n_target_values, 2)`` output, so column ``i``
    of the result lies in ``[0, dataset.shape[i])``.

    :param dataset: array whose shape bounds the coordinates (assumed 2-D so
        that its shape broadcasts against the output -- TODO confirm).
    :param n_target_values: number of coordinate pairs to draw.
    :param rand: NumPy random generator used for the draw.
    :return: integer array of shape ``(n_target_values, 2)``.
    """
    exclusive_upper_bounds = dataset.shape
    output_shape = (n_target_values, 2)
    return rand.integers(low=0, high=exclusive_upper_bounds, size=output_shape)
def select_contours(
    target_cells: np.ndarray, rand: np.random.Generator,
) -> list[np.ndarray]:
    """Generate a random-length run of neighbour coordinates per target.

    One count in ``[1, 4]`` is drawn per target cell; the target's
    neighbours are the target coordinates plus that many leading entries of
    the fixed offset table ``(0, 1), (1, 0), (0, -1), (-1, 0)``.

    :param target_cells: array with one coordinate pair per row.
    :param rand: NumPy random generator used to draw the counts.
    :return: list (the result is jagged) holding one ``(count, 2)`` array
        per target cell, in input order.
    """
    offset_table = np.array(((0, 1), (1, 0), (0, -1), (-1, 0)))
    # `high` is exclusive, so this yields 1..len(offset_table) inclusive.
    counts = rand.integers(
        low=1, high=1 + len(offset_table), size=target_cells.shape[0],
    )
    return [
        cell + offset_table[:count]
        for cell, count in zip(target_cells, counts)
    ]
def create_zipf_distribution(rand: np.random.Generator) -> np.ndarray:
    """Draw a 50 x 5 array of Zipf samples (distribution parameter 2).

    :param rand: NumPy random generator to sample from.
    :return: array of shape ``(50, 5)``.
    """
    n_rows, n_cols = 50, 5
    return rand.zipf(2, size=(n_rows, n_cols))
def apply_contours(
    target_cells: np.ndarray,
    contour_cells: list[np.ndarray],
) -> list[np.ndarray]:
    """Stack each target cell on top of its contour cells.

    Targets and contour groups are paired positionally; pairing stops at the
    shorter of the two inputs.

    :param target_cells: array with one coordinate pair per row.
    :param contour_cells: one array of contour coordinates per target.
    :return: list of arrays; row 0 of each array is the target cell and the
        remaining rows are its contour cells.
    """
    return [
        np.concatenate(([target_cell], contours))
        for contours, target_cell in zip(contour_cells, target_cells)
    ]
def create_possible_datasets(
    dataset: np.ndarray, target_cells_with_contour: list[np.ndarray],
) -> np.ndarray:
    """Build every dataset obtained by substituting target cells.

    Each entry of ``target_cells_with_contour`` is a group of cell
    coordinates: row 0 is the target cell and the remaining rows are
    candidate replacement cells. A coordinate's components index the axes of
    ``dataset`` in order. For every combination that picks one cell from
    each group (Cartesian product), one copy of ``dataset`` is produced per
    target in which only that target's cell is overwritten with the value of
    the picked cell.

    Coordinates are converted to tuples before indexing: indexing a 2-D
    array with a length-2 integer *array* would select two whole rows
    instead of a single cell. Candidate cells that fall outside ``dataset``
    are dropped, because such neighbours do not exist and would otherwise
    raise or wrap around via negative indexing.

    The number of combinations grows as the product of the group sizes.

    :param dataset: source array; it is never modified.
    :param target_cells_with_contour: list of ``(k_i, dataset.ndim)``
        coordinate groups, target first.
    :return: array of shape ``(n_combinations, n_targets, *dataset.shape)``;
        an empty array when no groups are given.
    :raises IndexError: if a group is empty or its target cell lies outside
        ``dataset``.
    """
    bounds = np.asarray(dataset.shape)

    targets = []
    candidates = []
    for group in target_cells_with_contour:
        cells = np.asarray(group).reshape(-1, dataset.ndim)
        inside = np.all((cells >= 0) & (cells < bounds), axis=1)
        if not inside[0]:
            raise IndexError(
                f"target cell {tuple(cells[0])} is outside a dataset "
                f"of shape {dataset.shape}"
            )
        targets.append(tuple(cells[0]))
        candidates.append([tuple(cell) for cell in cells[inside]])

    all_datasets_final = []
    for combination in product(*candidates):
        if not combination:
            # product() of zero groups yields one empty tuple; ignore it.
            continue
        variants = []
        for target, chosen in zip(targets, combination):
            variant = dataset.copy()
            variant[target] = dataset[chosen]
            variants.append(variant)
        all_datasets_final.append(variants)
    return np.array(all_datasets_final)
def main() -> None:
    """Run the demo pipeline with a fixed seed and print each stage.

    Creates a generator seeded with 0, samples the Zipf dataset, draws 5
    target cells, picks their contour cells, groups targets with contours
    and prints the resulting datasets.
    """
    rng = np.random.default_rng(seed=0)
    dataset = create_zipf_distribution(rng)

    targets = select_target_values(dataset=dataset, n_target_values=5, rand=rng)
    print(targets)

    contours = select_contours(target_cells=targets, rand=rng)
    print(contours)

    grouped_cells = apply_contours(targets, contours)
    print(
        create_possible_datasets(
            dataset=dataset, target_cells_with_contour=grouped_cells,
        )
    )
# Only run the demo when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Answered By - Reinderien
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.