Monday, January 29, 2024

[FIXED] How to replace a cell value with each of its contour cells and yield the corresponding datasets separately in a list, in a Pandas way?

January 29, 2024 numpy, pandas, python No comments

Issue

I have an initial dataframe with a random list of target cells (each cell being identified with a couple (x,y)). I want to yield four different dataframes each containing the value of one of the contour (surrounding) cells of each specified target cell.

The surrounding cells to consider for a specific target cell are (x-1,y), (x,y-1), (x+1,y) and (x,y+1). Specifically, I randomly choose 1 to 4 of these cells and consider each of them as a replacement for the target cell.

I want to do that through a Pandas-specific approach, without having to define the contour cells separately and then apply the changes to the dataframe (that is, using an all-in-one approach). I think using a Pandas (or a Numpy) approach would optimize the execution of the program.

Target cells could make up as much as 10% of the dataset; a good example to start with would have 10 to 100 values.

For now I have written this example, which I think is not Pandas-specific:

from numpy import random
import pandas as pd
import numpy as np
import operator
import math
from collections import deque
from queue import *
from queue import Queue
from itertools import product


def select_target_values(dataframe, number_of_target_values):
    """Draw random target cells from ``dataframe``.

    Each cell is an ``(x, y)`` tuple: ``x`` is bounded by the number of
    columns and ``y`` by the number of rows. Cells are drawn independently,
    so the same cell may appear more than once.

    Returns a list with ``number_of_target_values`` tuples.
    """
    # NOTE: numpy's randint excludes its upper bound, so with these limits
    # the last column and the last row are never drawn.
    x_stop = len(dataframe.columns) - 1
    y_stop = len(dataframe) - 1
    return [
        (random.randint(0, x_stop), random.randint(0, y_stop))
        for _ in range(number_of_target_values)
    ]


def select_contours(target_cells):
    """Pick between one and four neighbouring cells for every target cell.

    For each ``(x, y)`` target a random count ``k`` in ``1..4`` is drawn and
    the first ``k`` offsets of ``(0, 1), (1, 0), (0, -1), (-1, 0)`` are added
    to the target, giving ``k`` neighbour coordinates.

    Returns a list that is aligned with ``target_cells``: entry ``i`` is the
    list of neighbour tuples belonging to ``target_cells[i]``. Coordinates are
    not checked against any dataframe bounds and may be negative.
    """
    contour_coordinates = [(0, 1), (1, 0), (0, -1), (-1, 0)]
    contour_cells = []
    for target_cell in target_cells:
        # `random` is numpy.random here and its randint excludes the upper
        # bound, so the bound must be len + 1 for all four neighbours to be
        # reachable.
        contour_cells_count = random.randint(1, len(contour_coordinates) + 1)
        # Exactly one entry is appended per target (no skipping), which keeps
        # the result index-aligned with `target_cells`.
        contour_cells.append(
            [
                (target_cell[0] + offset_x, target_cell[1] + offset_y)
                for offset_x, offset_y in contour_coordinates[:contour_cells_count]
            ]
        )
    return contour_cells


def create_zipf_distribution():
    """Return a 50-row, 5-column DataFrame of Zipf samples (parameter 2).

    The samples come from numpy's global random state.
    """
    shape = (50, 5)
    samples = random.zipf(2, size=shape).reshape(shape)
    return pd.DataFrame(samples).round(3)


def apply_contours(target_cells, contour_cells):
    """Group every target cell with its contour cells.

    Entry ``i`` of the result is a new list that starts with
    ``target_cells[i]`` and continues with the items of ``contour_cells[i]``.
    """
    return [
        [target_cell, *contour_cells[position]]
        for position, target_cell in enumerate(target_cells)
    ]


def create_possible_datasets(dataframe, target_cells_with_contour):
    """Build the dataframe variants for every combination of candidate cells.

    ``target_cells_with_contour`` holds one group per target; the first cell
    of a group is the target itself and the rest are its candidates. Cells
    are ``(x, y)`` tuples used positionally as ``[y, x]`` (row, column).

    Every combination that takes one cell from each group is enumerated with
    ``itertools.product``. For a combination, one copy of ``dataframe`` is
    made per target, and in that copy only that target's cell is overwritten
    with the value found at the cell chosen for it.

    Returns a list with one entry per combination, each entry being a list of
    DataFrames (one per target). ``dataframe`` itself is not modified.
    """
    source = dataframe.copy()

    # An empty combination only occurs when there are no groups at all.
    combinations = [
        list(choice) for choice in product(*target_cells_with_contour) if choice
    ]
    targets = [group[0] for group in target_cells_with_contour]

    results = []
    for combination in combinations:
        variants = []
        for target, chosen in zip(targets, combination):
            variant = dataframe.copy()
            variant.iat[target[1], target[0]] = source.iloc[chosen[1], chosen[0]]
            variants.append(variant)
        results.append(variants)
    return results


def main():
    """Run the demo end to end and print the intermediate and final results.

    Generates the Zipf dataset, draws five target cells, picks contour cells
    for them and prints the targets, the contours and all resulting datasets.
    """
    dataset = create_zipf_distribution()

    targets = select_target_values(dataset, 5)
    print(targets)

    contours = select_contours(targets)
    print(contours)

    grouped_cells = apply_contours(targets, contours)
    print(create_possible_datasets(dataset, grouped_cells))


# Guard the entry point so that importing this module does not run the demo.
if __name__ == "__main__":
    main()

If you have a better Pandas approach (unifying all these methods into one that makes use of dataframe methods only), please let me know.

Solution

This is not a Pandas question; it's a Numpy question. Your code is pulled inappropriately in the direction of Pandas, and also inappropriately in the direction of pure Python iterations; neither should be used when base Numpy is available. The following has refactored everything somewhat heavily except create_possible_datasets which needs more work. Your use of product creates a size explosion and it's unclear what you're doing there or why that is necessary.

import numpy as np
from itertools import product


def select_target_values(
    dataset: np.ndarray, n_target_values: int, rand: np.random.Generator,
) -> np.ndarray:
    """Draw random cell coordinates that lie inside `dataset`.

    Returns an integer array of shape ``(n_target_values, 2)``. The exclusive
    upper bounds are taken from ``dataset.shape`` and broadcast against the
    two coordinate columns (assumes `dataset` is 2-D, so column 0 is bounded
    by the first axis length and column 1 by the second).
    """
    axis_lengths = dataset.shape
    return rand.integers(low=0, high=axis_lengths, size=(n_target_values, 2))


def select_contours(
    target_cells: np.ndarray, rand: np.random.Generator,
) -> list[np.ndarray]:
    """Pick between one and four neighbouring coordinates for each target.

    For every row of `target_cells` a count ``k`` in ``1..4`` is drawn and the
    first ``k`` offsets of ``(0, 1), (1, 0), (0, -1), (-1, 0)`` are added to
    that row.

    Returns a list (the result is jagged) holding one ``(k, 2)`` array per
    target, in the same order as `target_cells`. Coordinates are not checked
    against any dataset bounds.
    """
    offsets = np.array(((0, 1), (1, 0), (0, -1), (-1, 0)))

    # One neighbour count per target; `high` is exclusive, hence the +1.
    counts = rand.integers(
        low=1, high=1 + len(offsets), size=target_cells.shape[0],
    )

    return [cell + offsets[:count] for cell, count in zip(target_cells, counts)]


def create_zipf_distribution(rand: np.random.Generator) -> np.ndarray:
    """Return a ``(50, 5)`` array of Zipf samples (parameter 2) drawn from `rand`."""
    exponent = 2
    shape = (50, 5)
    return rand.zipf(exponent, shape)


def apply_contours(
    target_cells: np.ndarray,
    contour_cells: list[np.ndarray],
) -> list[np.ndarray]:
    """Prepend each target cell to its own array of contour cells.

    Entry ``i`` of the result is a new array whose first row is
    ``target_cells[i]`` and whose remaining rows are ``contour_cells[i]``.
    """
    return [
        np.concatenate(([target], neighbours), axis=0)
        for neighbours, target in zip(contour_cells, target_cells)
    ]


def create_possible_datasets(
    dataset: np.ndarray, target_cells_with_contour: list[np.ndarray],
) -> np.ndarray:
    """Build the dataset variants for every combination of candidate cells.

    `target_cells_with_contour` holds one coordinate array per target; its
    first row is the target cell and the remaining rows are the candidate
    cells. Each coordinate must have one entry per axis of `dataset`.

    Every combination that takes one row from each group is enumerated with
    ``itertools.product``, so the number of combinations is the product of the
    group sizes. For a combination, one copy of `dataset` is made per target,
    and in that copy only that target's cell is overwritten with the value of
    the cell chosen for it.

    Source coordinates are wrapped around the array edges (modulo the axis
    length). For negative coordinates this matches ordinary negative
    indexing; coordinates past the far edge wrap to the start instead of
    raising ``IndexError``.

    Returns an array of shape ``(n_combinations, n_targets, *dataset.shape)``,
    or an empty array when there are no targets. `dataset` is not modified.
    """
    axis_lengths = np.array(dataset.shape)
    # Indexing with a tuple addresses one cell. Indexing with the raw 1-D
    # coordinate array would be integer-array indexing and select whole rows.
    targets = [tuple(group[0]) for group in target_cells_with_contour]

    all_datasets_final = []
    for chosen_cells in product(*target_cells_with_contour):
        if not chosen_cells:
            # product() of zero groups yields a single empty combination.
            continue
        variants = []
        for target, chosen in zip(targets, chosen_cells):
            source = tuple(np.mod(chosen, axis_lengths))
            variant = dataset.copy()
            variant[target] = dataset[source]
            variants.append(variant)
        all_datasets_final.append(variants)
    return np.array(all_datasets_final)


def main() -> None:
    """Run the demo with a fixed seed and print each stage's result.

    Generates the Zipf dataset, draws five target cells, picks contour cells
    for them and prints the targets, the contours and all resulting datasets.
    """
    rng = np.random.default_rng(seed=0)
    data = create_zipf_distribution(rng)

    targets = select_target_values(data, 5, rng)
    print(targets)

    contours = select_contours(targets, rng)
    print(contours)

    grouped_cells = apply_contours(targets, contours)
    print(create_possible_datasets(data, grouped_cells))


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

Answered By - Reinderien

This answer, collected from Stack Overflow and tested by PythonFixing community admins, is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Monday, January 29, 2024

[FIXED] How to replace a cell value with each of its contour cells and yield the corresponding datasets separately in a list, in a Pandas way?

Issue

Solution

0 comments:

Post a Comment

Popular Posts

Labels