Skip to content

Commit 7d96d76

Browse files
committed
add DataFrame conversion function
1 parent c8f952e commit 7d96d76

2 files changed

Lines changed: 256 additions & 2 deletions

File tree

meg_utils/misc.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,45 @@ def _safe_send(msg, parse_mode):
669669
pass
670670

671671

672+
def convert_to_numeric(df, convert_dtypes=True, inplace=True):
    """
    Convert DataFrame columns to numeric dtypes where possible.

    Tries ``pd.to_numeric`` on every column that is not already numeric.
    If all values in a column convert successfully, the numeric column is
    kept; otherwise the original column is retained unchanged. Columns that
    contain any missing value (``None`` / ``NaN``) are never converted, and
    datetime / timedelta columns are left alone, because ``pd.to_numeric``
    can reinterpret them as raw integers instead of rejecting them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame.
    convert_dtypes : bool, default=True
        If True, apply ``DataFrame.convert_dtypes()`` at the end to further
        optimize dtypes (e.g. nullable integer types, string dtype).
    inplace : bool, default=True
        If True, modify *df* in place (including the ``convert_dtypes``
        step). If False, operate on a copy and leave *df* untouched.

    Returns
    -------
    pandas.DataFrame
        DataFrame with eligible columns converted to numeric types. With
        ``inplace=True`` this is the same object as *df*.
    """
    if not inplace:
        df = df.copy()

    dtype_api = pd.api.types
    for col in df.columns:
        column = df[col]
        if dtype_api.is_numeric_dtype(column):
            continue
        # Temporal columns must not be silently turned into integers.
        if (dtype_api.is_datetime64_any_dtype(column)
                or dtype_api.is_timedelta64_dtype(column)):
            continue
        # pd.to_numeric accepts None/NaN and maps them to NaN, so the
        # documented "missing values block conversion" rule has to be
        # enforced explicitly before attempting the conversion.
        if column.isna().any():
            continue
        try:
            converted = pd.to_numeric(column, errors='raise')
        except (ValueError, TypeError):
            # ValueError: unparseable strings; TypeError: cells holding
            # objects that cannot be parsed at all (e.g. lists).
            continue
        df[col] = converted

    if convert_dtypes:
        optimized = df.convert_dtypes()
        if inplace:
            # convert_dtypes() returns a new frame; write the columns back
            # so the caller's object really is modified in place.
            for col in df.columns:
                df[col] = optimized[col]
        else:
            df = optimized
    return df
709+
710+
672711
def _fmt_duration(seconds):
673712
"""Return human-readable duration."""
674713
seconds = float(seconds)

tests/test_misc.py

Lines changed: 217 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,15 @@
33
"""
44
Tests for meg_utils.misc — focusing on to_long_df / long_df_to_array / convert_to_numeric.
55
"""
6-
import sys; sys.path.append('../..')
6+
import sys, os
7+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
78

89
import unittest
910
import numpy as np
1011
import pandas as pd
1112
import pytest
1213

13-
from meg_utils.misc import to_long_df, long_df_to_array
14+
from meg_utils.misc import to_long_df, long_df_to_array, convert_to_numeric
1415

1516

1617
# ---------------------------------------------------------------------------
@@ -212,5 +213,219 @@ def test_empty_columns_list(self):
212213
long_df_to_array(df, columns=[], value_name='v')
213214

214215

216+
# ---------------------------------------------------------------------------
217+
# convert_to_numeric
218+
# ---------------------------------------------------------------------------
219+
220+
class TestConvertToNumeric:
    """Behavioural tests for ``convert_to_numeric``.

    Covers string-to-number conversion, columns that must stay untouched
    (text, mixed content, missing values), the ``inplace`` and
    ``convert_dtypes`` switches, and a handful of edge cases.
    """

    # --- integer-like strings ---

    def test_int_strings_converted(self):
        """Columns of integer strings become numeric."""
        df = pd.DataFrame({'a': ['1', '2', '3']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert list(out['a']) == [1, 2, 3]

    def test_negative_int_strings(self):
        """Negative and zero integer strings convert as well."""
        df = pd.DataFrame({'a': ['-5', '0', '10']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert list(out['a']) == [-5, 0, 10]

    # --- float-like strings ---

    def test_float_strings_converted(self):
        """Columns of decimal strings become numeric floats."""
        df = pd.DataFrame({'a': ['1.5', '2.7', '3.0']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        np.testing.assert_allclose(out['a'].values, [1.5, 2.7, 3.0])

    def test_scientific_notation(self):
        """Scientific notation strings are parsed as numbers."""
        df = pd.DataFrame({'a': ['1e3', '2.5e-1', '3E2']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        np.testing.assert_allclose(out['a'].values, [1000.0, 0.25, 300.0])

    # --- already numeric columns stay numeric ---

    def test_int_column_unchanged(self):
        """Integer columns keep their values and stay numeric."""
        df = pd.DataFrame({'a': [1, 2, 3]})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert list(out['a']) == [1, 2, 3]

    def test_float_column_unchanged(self):
        """Float columns keep their values and stay numeric."""
        df = pd.DataFrame({'a': [1.1, 2.2, 3.3]})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        # Also pin the values, not just the dtype family.
        np.testing.assert_allclose(
            out['a'].to_numpy(dtype=float), [1.1, 2.2, 3.3]
        )

    # --- non-convertible strings stay as strings ---

    def test_pure_text_not_converted(self):
        """Plain text columns are returned with their original values."""
        df = pd.DataFrame({'a': ['hello', 'world', 'foo']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])
        assert list(out['a']) == ['hello', 'world', 'foo']

    def test_mixed_text_and_numbers_not_converted(self):
        """If any value would become NaN, the whole column stays unchanged."""
        df = pd.DataFrame({'a': ['1', '2', 'three']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_partial_numeric_not_converted(self):
        """Even a single non-numeric value blocks conversion."""
        df = pd.DataFrame({'a': ['1.0', '2.0', 'N/A']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    # --- NaN / None handling ---
    # Any None/NaN in a column is expected to block conversion.

    def test_column_with_none_and_numeric_strings_not_converted(self):
        """None among numeric strings blocks conversion."""
        df = pd.DataFrame({'a': ['1', None, '3']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_all_nan_column_not_converted(self):
        """A column of all None is not converted to numeric."""
        df = pd.DataFrame({'a': [None, None, None]})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_existing_nan_with_non_numeric_stays(self):
        """NaN + non-numeric strings: column should not be converted."""
        df = pd.DataFrame({'a': ['hello', None, 'world']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_np_nan_in_numeric_strings_not_converted(self):
        """np.nan among numeric strings also blocks conversion."""
        df = pd.DataFrame({'a': ['1', np.nan, '3']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    # --- multiple columns ---

    def test_mixed_columns(self):
        """Each column is handled independently."""
        df = pd.DataFrame({
            'nums': ['10', '20', '30'],
            'text': ['a', 'b', 'c'],
            'floats': ['1.1', '2.2', '3.3'],
            'mixed': ['1', 'x', '3'],
            'ints': [4, 5, 6],
        })
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['nums'])
        assert not pd.api.types.is_numeric_dtype(out['text'])
        assert pd.api.types.is_numeric_dtype(out['floats'])
        assert not pd.api.types.is_numeric_dtype(out['mixed'])
        assert pd.api.types.is_numeric_dtype(out['ints'])

    # --- boolean-like strings ---

    def test_boolean_strings_not_numeric(self):
        """'True'/'False' strings should not become numeric (they are not numbers)."""
        df = pd.DataFrame({'a': ['True', 'False', 'True']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    # --- inplace parameter ---

    def test_inplace_true_modifies_original(self):
        """With inplace=True (default), the input DataFrame is mutated in place."""
        df = pd.DataFrame({'a': ['1', '2', '3']})
        convert_to_numeric(df, inplace=True, convert_dtypes=False)
        # the column was converted on the original df
        assert pd.api.types.is_numeric_dtype(df['a'])

    def test_inplace_false_preserves_original(self):
        """With inplace=False, the input DataFrame is not mutated."""
        df = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['x', 'y', 'z']})
        df_orig = df.copy()
        out = convert_to_numeric(df, inplace=False)
        pd.testing.assert_frame_equal(df, df_orig)
        assert pd.api.types.is_numeric_dtype(out['a'])

    # --- empty DataFrame ---

    def test_empty_dataframe(self):
        """A DataFrame with no rows and no columns passes through."""
        df = pd.DataFrame()
        out = convert_to_numeric(df)
        assert out.empty

    def test_dataframe_no_rows(self):
        """A column with zero rows does not raise."""
        df = pd.DataFrame({'a': pd.Series([], dtype='object')})
        out = convert_to_numeric(df)
        assert len(out) == 0

    # --- convert_dtypes parameter ---

    def test_convert_dtypes_true_uses_nullable_int(self):
        """With convert_dtypes=True, integer columns use nullable Int64."""
        df = pd.DataFrame({'a': ['1', '2', '3']})
        out = convert_to_numeric(df, convert_dtypes=True)
        # is_integer_dtype() is also true for plain int64, so check the
        # nullable extension dtype explicitly.
        assert isinstance(out['a'].dtype, pd.Int64Dtype)

    def test_convert_dtypes_false_skips_conversion(self):
        """With convert_dtypes=False, no convert_dtypes() call is made."""
        df = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['x', 'y', 'z']})
        out = convert_to_numeric(df, convert_dtypes=False)
        assert pd.api.types.is_numeric_dtype(out['a'])
        # 'b' should remain object dtype, not StringDtype
        assert out['b'].dtype == object

    def test_convert_dtypes_true_string_dtype(self):
        """With convert_dtypes=True, text columns get StringDtype."""
        df = pd.DataFrame({'a': ['hello', 'world']})
        out = convert_to_numeric(df, convert_dtypes=True)
        # is_string_dtype() is also true for object columns of strings, so
        # it cannot tell whether convert_dtypes() ran; check the dtype class.
        assert isinstance(out['a'].dtype, pd.StringDtype)

    # --- edge cases ---

    def test_whitespace_strings_not_converted(self):
        """Strings with only whitespace should not become numeric."""
        df = pd.DataFrame({'a': [' ', '\t', '\n']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_numeric_with_whitespace_converted(self):
        """Numeric strings with leading/trailing whitespace can still convert."""
        df = pd.DataFrame({'a': [' 1 ', ' 2', '3 ']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert list(out['a']) == [1, 2, 3]

    def test_inf_strings_converted(self):
        """'inf' and '-inf' are valid numeric values."""
        df = pd.DataFrame({'a': ['inf', '-inf', '0']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert np.isinf(out['a'].values[:2]).all()

    def test_single_column_single_row(self):
        """A 1x1 frame converts like any other."""
        df = pd.DataFrame({'a': ['42']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert out['a'].iloc[0] == 42

    def test_categorical_column_not_converted(self):
        """Categorical string columns should not be converted to numeric."""
        df = pd.DataFrame({'a': pd.Categorical(['x', 'y', 'z'])})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_categorical_numeric_stays(self):
        """Categorical columns with numeric categories are already numeric-like."""
        df = pd.DataFrame({'a': pd.Categorical([1, 2, 3])})
        out = convert_to_numeric(df)
        # should not error; exact dtype depends on convert_dtypes behavior
        assert len(out) == 3
428+
429+
215430
if __name__ == "__main__":
    # unittest.main() only collects unittest.TestCase subclasses, so plain
    # pytest-style classes such as TestConvertToNumeric would be silently
    # skipped when this file is run as a script. pytest collects both kinds;
    # "-v" keeps the verbose per-test output.
    sys.exit(pytest.main([__file__, "-v"]))

0 commit comments

Comments
 (0)