|
3 | 3 | """ |
4 | 4 | Tests for meg_utils.misc — focusing on to_long_df / long_df_to_array and convert_to_numeric. |
5 | 5 | """ |
6 | | -import sys; sys.path.append('../..') |
| 6 | +import sys, os |
| 7 | +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
7 | 8 |
|
8 | 9 | import unittest |
9 | 10 | import numpy as np |
10 | 11 | import pandas as pd |
11 | 12 | import pytest |
12 | 13 |
|
13 | | -from meg_utils.misc import to_long_df, long_df_to_array |
| 14 | +from meg_utils.misc import to_long_df, long_df_to_array, convert_to_numeric |
14 | 15 |
|
15 | 16 |
|
16 | 17 | # --------------------------------------------------------------------------- |
@@ -212,5 +213,219 @@ def test_empty_columns_list(self): |
212 | 213 | long_df_to_array(df, columns=[], value_name='v') |
213 | 214 |
|
214 | 215 |
|
| 216 | +# --------------------------------------------------------------------------- |
| 217 | +# convert_to_numeric |
| 218 | +# --------------------------------------------------------------------------- |
| 219 | + |
class TestConvertToNumeric:
    """Behavioural tests for ``convert_to_numeric``.

    Each test builds a small DataFrame, runs ``convert_to_numeric`` on it and
    checks the resulting column dtypes and/or values. The expectations encoded
    here are: columns whose strings all parse as numbers become numeric,
    columns containing any non-parsable or missing value are left alone, and
    the ``inplace`` / ``convert_dtypes`` keyword arguments behave as named.
    """

    # --- integer-like strings ---

    def test_int_strings_converted(self):
        """Columns of integer strings become numeric."""
        df = pd.DataFrame({'a': ['1', '2', '3']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert list(out['a']) == [1, 2, 3]

    def test_negative_int_strings(self):
        """Negative and zero integer strings are converted as well."""
        df = pd.DataFrame({'a': ['-5', '0', '10']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert list(out['a']) == [-5, 0, 10]

    # --- float-like strings ---

    def test_float_strings_converted(self):
        """Decimal strings become a numeric column with the same values."""
        df = pd.DataFrame({'a': ['1.5', '2.7', '3.0']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        np.testing.assert_allclose(out['a'].values, [1.5, 2.7, 3.0])

    def test_scientific_notation(self):
        """Strings in scientific notation (either case of 'e') are parsed."""
        df = pd.DataFrame({'a': ['1e3', '2.5e-1', '3E2']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        np.testing.assert_allclose(out['a'].values, [1000.0, 0.25, 300.0])

    # --- already numeric columns stay numeric ---

    def test_int_column_unchanged(self):
        """An integer column keeps a numeric dtype and its values."""
        df = pd.DataFrame({'a': [1, 2, 3]})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert list(out['a']) == [1, 2, 3]

    def test_float_column_unchanged(self):
        """A float column keeps a numeric dtype and its values."""
        df = pd.DataFrame({'a': [1.1, 2.2, 3.3]})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        # "unchanged" must cover the values too, not only the dtype family;
        # astype(float) normalises a possible nullable Float64 result.
        np.testing.assert_allclose(
            out['a'].astype(float).to_numpy(), [1.1, 2.2, 3.3]
        )

    # --- non-convertible strings stay as strings ---

    def test_pure_text_not_converted(self):
        """A column of plain words is left non-numeric with values intact."""
        df = pd.DataFrame({'a': ['hello', 'world', 'foo']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])
        assert list(out['a']) == ['hello', 'world', 'foo']

    def test_mixed_text_and_numbers_not_converted(self):
        """If any value would become NaN, the whole column stays unchanged."""
        df = pd.DataFrame({'a': ['1', '2', 'three']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_partial_numeric_not_converted(self):
        """Even a single non-numeric value blocks conversion."""
        df = pd.DataFrame({'a': ['1.0', '2.0', 'N/A']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    # --- NaN / None handling ---
    # These tests expect any None/NaN in a column to block conversion.
    # NOTE(review): the original comment attributes this to errors='raise' in
    # the implementation, which is not visible from this file — confirm.

    def test_column_with_none_and_numeric_strings_not_converted(self):
        """None among numeric strings blocks conversion."""
        df = pd.DataFrame({'a': ['1', None, '3']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_all_nan_column_not_converted(self):
        """A column of all None is not converted to numeric."""
        df = pd.DataFrame({'a': [None, None, None]})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_existing_nan_with_non_numeric_stays(self):
        """NaN + non-numeric strings: column should not be converted."""
        df = pd.DataFrame({'a': ['hello', None, 'world']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_np_nan_in_numeric_strings_not_converted(self):
        """np.nan among numeric strings also blocks conversion."""
        df = pd.DataFrame({'a': ['1', np.nan, '3']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    # --- multiple columns ---

    def test_mixed_columns(self):
        """Each column is handled independently."""
        df = pd.DataFrame({
            'nums': ['10', '20', '30'],
            'text': ['a', 'b', 'c'],
            'floats': ['1.1', '2.2', '3.3'],
            'mixed': ['1', 'x', '3'],
            'ints': [4, 5, 6],
        })
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['nums'])
        assert not pd.api.types.is_numeric_dtype(out['text'])
        assert pd.api.types.is_numeric_dtype(out['floats'])
        assert not pd.api.types.is_numeric_dtype(out['mixed'])
        assert pd.api.types.is_numeric_dtype(out['ints'])

    # --- boolean-like strings ---

    def test_boolean_strings_not_numeric(self):
        """'True'/'False' strings should not become numeric (they are not numbers)."""
        df = pd.DataFrame({'a': ['True', 'False', 'True']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    # --- inplace parameter ---

    def test_inplace_true_modifies_original(self):
        """With inplace=True, the input DataFrame is mutated in place."""
        df = pd.DataFrame({'a': ['1', '2', '3']})
        convert_to_numeric(df, inplace=True, convert_dtypes=False)
        # the column was converted on the original df
        assert pd.api.types.is_numeric_dtype(df['a'])

    def test_inplace_false_preserves_original(self):
        """With inplace=False, the input DataFrame is not mutated."""
        df = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['x', 'y', 'z']})
        df_orig = df.copy()
        out = convert_to_numeric(df, inplace=False)
        pd.testing.assert_frame_equal(df, df_orig)
        assert pd.api.types.is_numeric_dtype(out['a'])

    # --- empty DataFrame ---

    def test_empty_dataframe(self):
        """A DataFrame with no columns and no rows comes back empty."""
        df = pd.DataFrame()
        out = convert_to_numeric(df)
        assert out.empty

    def test_dataframe_no_rows(self):
        """A DataFrame with a column but zero rows comes back with zero rows."""
        df = pd.DataFrame({'a': pd.Series([], dtype='object')})
        out = convert_to_numeric(df)
        assert len(out) == 0

    # --- convert_dtypes parameter ---

    def test_convert_dtypes_true_uses_nullable_int(self):
        """With convert_dtypes=True, integer columns use a nullable integer dtype."""
        df = pd.DataFrame({'a': ['1', '2', '3']})
        out = convert_to_numeric(df, convert_dtypes=True)
        assert pd.api.types.is_integer_dtype(out['a'])
        # is_integer_dtype is also True for plain NumPy int64; the nullable
        # variant is a pandas extension dtype, so check that explicitly.
        assert pd.api.types.is_extension_array_dtype(out['a'].dtype)

    def test_convert_dtypes_false_skips_conversion(self):
        """With convert_dtypes=False, no convert_dtypes() call is made."""
        df = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['x', 'y', 'z']})
        out = convert_to_numeric(df, convert_dtypes=False)
        assert pd.api.types.is_numeric_dtype(out['a'])
        # 'b' should remain object dtype, not StringDtype
        assert out['b'].dtype == object

    def test_convert_dtypes_true_string_dtype(self):
        """With convert_dtypes=True, text columns get StringDtype."""
        df = pd.DataFrame({'a': ['hello', 'world']})
        out = convert_to_numeric(df, convert_dtypes=True)
        # is_string_dtype() also accepts an object column of str values, so it
        # cannot tell whether the dtype was actually converted; check the
        # dtype class itself.
        assert isinstance(out['a'].dtype, pd.StringDtype)

    # --- edge cases ---

    def test_whitespace_strings_not_converted(self):
        """Strings with only whitespace should not become numeric."""
        df = pd.DataFrame({'a': [' ', '\t', '\n']})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_numeric_with_whitespace_converted(self):
        """Numeric strings with leading/trailing whitespace can still convert."""
        df = pd.DataFrame({'a': [' 1 ', ' 2', '3 ']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert list(out['a']) == [1, 2, 3]

    def test_inf_strings_converted(self):
        """'inf' and '-inf' are valid numeric values."""
        df = pd.DataFrame({'a': ['inf', '-inf', '0']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert np.isinf(out['a'].values[:2]).all()

    def test_single_column_single_row(self):
        """A 1x1 DataFrame holding a numeric string is converted."""
        df = pd.DataFrame({'a': ['42']})
        out = convert_to_numeric(df)
        assert pd.api.types.is_numeric_dtype(out['a'])
        assert out['a'].iloc[0] == 42

    def test_categorical_column_not_converted(self):
        """Categorical string columns should not be converted to numeric."""
        df = pd.DataFrame({'a': pd.Categorical(['x', 'y', 'z'])})
        out = convert_to_numeric(df)
        assert not pd.api.types.is_numeric_dtype(out['a'])

    def test_categorical_numeric_stays(self):
        """Categorical columns with numeric categories are already numeric-like."""
        df = pd.DataFrame({'a': pd.Categorical([1, 2, 3])})
        out = convert_to_numeric(df)
        # should not error; exact dtype depends on convert_dtypes behavior
        assert len(out) == 3
| 428 | + |
| 429 | + |
if __name__ == "__main__":
    # Run through pytest rather than unittest.main(): unittest only collects
    # unittest.TestCase subclasses, so plain pytest-style classes such as
    # TestConvertToNumeric would be silently skipped when this file is
    # executed directly. pytest collects both kinds.
    raise SystemExit(pytest.main([__file__, "-v"]))
0 commit comments