Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/data_morph/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ def __init__(
self.df: pd.DataFrame = self._validate_data(df).pipe(self._scale_data, scale)
"""pandas.DataFrame: DataFrame containing columns x and y."""

self._x = self.df['x'].to_numpy()
self._y = self.df['y'].to_numpy()
Comment on lines +55 to +56
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
self._x = self.df['x'].to_numpy()
self._y = self.df['y'].to_numpy()
self._x, self._y = self.df[['x', 'y']].to_numpy().T

Comment on lines +55 to +56
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should these be properties? If we change the DataFrame, these cached arrays will no longer match it.


self.name: str = name
"""str: The name to use for the dataset."""

Expand Down
23 changes: 14 additions & 9 deletions src/data_morph/data/stats.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""Utility functions for calculating summary statistics."""

from collections import namedtuple
from numbers import Number
from typing import Iterable

import pandas as pd
import numpy as np

SummaryStatistics = namedtuple(
'SummaryStatistics', ['x_mean', 'y_mean', 'x_stdev', 'y_stdev', 'correlation']
Expand All @@ -12,14 +14,17 @@
)


def get_values(df: pd.DataFrame) -> SummaryStatistics:
def get_values(x: Iterable[Number], y: Iterable[Number]) -> SummaryStatistics:
"""
Calculate the summary statistics for the given set of points.

Parameters
----------
df : pandas.DataFrame
A dataset with columns x and y.
x : Iterable[Number]
The ``x`` value of the dataset.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change

y : Iterable[Number]
The ``y`` value of the dataset.

Returns
-------
Expand All @@ -28,9 +33,9 @@ def get_values(df: pd.DataFrame) -> SummaryStatistics:
along with the Pearson correlation coefficient between the two.
"""
return SummaryStatistics(
df.x.mean(),
df.y.mean(),
df.x.std(),
df.y.std(),
df.corr().x.y,
np.mean(x),
np.mean(y),
np.std(x, ddof=1),
np.std(y, ddof=1),
np.corrcoef(x, y)[0, 1],
)
65 changes: 42 additions & 23 deletions src/data_morph/morpher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

from collections.abc import Iterable, MutableSequence
from functools import partial
from numbers import Number
from pathlib import Path
Expand Down Expand Up @@ -240,16 +241,26 @@ def _record_frames(
frame_number += 1
return frame_number

def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
def _is_close_enough(
self,
x1: Iterable[Number],
y1: Iterable[Number],
x2: Iterable[Number],
y2: Iterable[Number],
) -> bool:
"""
Check whether the statistics are within the acceptable bounds.

Parameters
----------
df1 : pandas.DataFrame
The original DataFrame.
df2 : pandas.DataFrame
The DataFrame after the latest perturbation.
x1 : Iterable[Number]
The original value of ``x``.
y1 : Iterable[Number]
The original value of ``y``.
x2 : Iterable[Number]
The perturbed value of ``x``.
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Extra space:

Suggested change
The perturbed value of ``x``.
The perturbed value of ``x``.

y2 : Iterable[Number]
The perturbed value of ``y``.

Returns
-------
Expand All @@ -259,32 +270,33 @@ def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
return np.all(
np.abs(
np.subtract(
*(
np.floor(np.array(get_values(data)) * 10**self.decimals)
for data in [df1, df2]
)
np.floor(np.array(get_values(x1, y1)) * 10**self.decimals),
np.floor(np.array(get_values(x2, y2)) * 10**self.decimals),
)
)
== 0
)

def _perturb(
self,
df: pd.DataFrame,
x: MutableSequence[Number],
y: MutableSequence[Number],
target_shape: Shape,
*,
shake: Number,
allowed_dist: Number,
temp: Number,
bounds: BoundingBox,
) -> pd.DataFrame:
) -> tuple[MutableSequence[Number], MutableSequence[Number]]:
"""
Perform one round of perturbation.

Parameters
----------
df : pandas.DataFrame
The data to perturb.
x : MutableSequence[Number]
The ``x`` part of the dataset.
y : MutableSequence[Number]
The ``y`` part of the dataset.
target_shape : Shape
The shape to morph the data into.
shake : numbers.Number
Expand All @@ -301,12 +313,12 @@ def _perturb(

Returns
-------
pandas.DataFrame
tuple[MutableSequence[Number], MutableSequence[Number]]
The input dataset with one point perturbed.
"""
row = self._rng.integers(0, len(df))
initial_x = df.at[row, 'x']
initial_y = df.at[row, 'y']
row = self._rng.integers(0, len(x))
initial_x = x[row]
initial_y = y[row]

# this is the simulated annealing step, if "do_bad", then we are willing to
# accept a new state which is worse than the current one
Expand All @@ -325,10 +337,10 @@ def _perturb(
within_bounds = [new_x, new_y] in bounds
done = close_enough and within_bounds

df.loc[row, 'x'] = new_x
df.loc[row, 'y'] = new_y
x[row] = new_x
y[row] = new_y

return df
return x, y

def morph(
self,
Expand Down Expand Up @@ -471,20 +483,27 @@ def _tweening(
max_value=max_shake,
)

x, y = (
start_shape.df['x'].to_numpy(copy=True),
start_shape.df['y'].to_numpy(copy=True),
)
Comment on lines +486 to +489
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we use the `_x` and `_y` attributes from the `Dataset.__init__()` changes?

Suggested change
x, y = (
start_shape.df['x'].to_numpy(copy=True),
start_shape.df['y'].to_numpy(copy=True),
)
x, y = start_shape._x, start_shape._y

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm also wondering whether we need to copy here, given that we already copy inside the loop.


for i in self._looper(
iterations, leave=True, ascii=True, desc=f'{target_shape} pattern'
):
perturbed_data = self._perturb(
morphed_data.copy(),
np.copy(x),
np.copy(y),
target_shape=target_shape,
shake=get_current_shake(i),
allowed_dist=allowed_dist,
temp=get_current_temp(i),
bounds=start_shape.morph_bounds,
)

if self._is_close_enough(start_shape.df, perturbed_data):
morphed_data = perturbed_data
if self._is_close_enough(x, y, *perturbed_data):
x, y = perturbed_data
morphed_data = pd.DataFrame({'x': x, 'y': y})
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't necessary in the loop with the switch to NumPy. We can have `_record_frames()` only make the DataFrame if we need to save the CSV. The `plot()` function can be reworked to use NumPy. To return the DataFrame at the end of this method, we can build it once outside of this loop instead of doing it thousands of times.


frame_number = record_frames(
data=morphed_data,
Expand Down
2 changes: 1 addition & 1 deletion src/data_morph/plotting/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def plot(
ax.xaxis.set_major_formatter(tick_formatter)
ax.yaxis.set_major_formatter(tick_formatter)

res = get_values(df)
res = get_values(df['x'].to_numpy(), df['y'].to_numpy())

labels = ('X Mean', 'Y Mean', 'X SD', 'Y SD', 'Corr.')
locs = np.linspace(0.8, 0.2, num=len(labels))
Expand Down
6 changes: 4 additions & 2 deletions tests/data/test_stats.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Test the stats module."""

import numpy as np

from data_morph.data.loader import DataLoader
from data_morph.data.stats import get_values

Expand All @@ -9,10 +11,10 @@ def test_stats():

data = DataLoader.load_dataset('dino').df

stats = get_values(data)
stats = get_values(data['x'], data['y'])

assert stats.x_mean == data.x.mean()
assert stats.y_mean == data.y.mean()
assert stats.x_stdev == data.x.std()
assert stats.y_stdev == data.y.std()
assert stats.correlation == data.corr().x.y
np.allclose(stats.correlation, data.corr().x.y)
4 changes: 3 additions & 1 deletion tests/test_morpher.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ def test_no_writing(self, capsys):

with pytest.raises(AssertionError):
assert_frame_equal(morphed_data, dataset.df)
assert morpher._is_close_enough(dataset.df, morphed_data)
assert morpher._is_close_enough(
dataset.df['x'], dataset.df['y'], morphed_data['x'], morphed_data['y']
)

_, err = capsys.readouterr()
assert f'{target_shape} pattern: 100%' in err
Expand Down