From 1f05349612bcc8b59ce2ab8149af62cb487c61b6 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 22 Aug 2025 23:32:04 +0200 Subject: [PATCH] Split: Remove the widget (being moved to core) --- .../prototypes/widgets/icons/Split.svg | 33 -- orangecontrib/prototypes/widgets/owsplit.py | 237 ------------- .../widgets/tests/orange-in-education.tab | 103 ------ .../prototypes/widgets/tests/test_owsplit.py | 328 ------------------ 4 files changed, 701 deletions(-) delete mode 100644 orangecontrib/prototypes/widgets/icons/Split.svg delete mode 100644 orangecontrib/prototypes/widgets/owsplit.py delete mode 100644 orangecontrib/prototypes/widgets/tests/orange-in-education.tab delete mode 100644 orangecontrib/prototypes/widgets/tests/test_owsplit.py diff --git a/orangecontrib/prototypes/widgets/icons/Split.svg b/orangecontrib/prototypes/widgets/icons/Split.svg deleted file mode 100644 index bc42545f..00000000 --- a/orangecontrib/prototypes/widgets/icons/Split.svg +++ /dev/null @@ -1,33 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/orangecontrib/prototypes/widgets/owsplit.py b/orangecontrib/prototypes/widgets/owsplit.py deleted file mode 100644 index 05fe042a..00000000 --- a/orangecontrib/prototypes/widgets/owsplit.py +++ /dev/null @@ -1,237 +0,0 @@ -from functools import partial - -import numpy as np - -from AnyQt.QtCore import Qt - -from Orange.widgets import gui -from Orange.widgets.settings import ContextSetting, DomainContextHandler -from Orange.widgets.widget import OWWidget, Msg, Output, Input -from Orange.widgets.utils.itemmodels import DomainModel -from Orange.widgets.utils.widgetpreview import WidgetPreview -from Orange.data import \ - Table, Domain, DiscreteVariable, StringVariable, ContinuousVariable -from Orange.data.util import SharedComputeValue, get_unique_names - -from orangewidget.settings import Setting - - -def get_substrings(values, delimiter): - return sorted({ss.strip() for s in values for ss in s.split(delimiter)} - - {""}) - - -class SplitColumnBase: - def __init__(self, data, attr, delimiter): - self.attr = attr - self.delimiter = delimiter - column = set(data.get_column(self.attr)) - self.new_values = tuple(get_substrings(column, self.delimiter)) - - def __eq__(self, other): - return self.attr == other.attr \ - and self.delimiter == other.delimiter \ - and self.new_values == other.new_values - - def __hash__(self): - return hash((self.attr, self.delimiter, self.new_values)) - - -class SplitColumnOneHot(SplitColumnBase): - InheritEq = True - - def __call__(self, data): - column = data.get_column(self.attr) - values = [{ss.strip() for ss in s.split(self.delimiter)} - for s in column] - return {v: np.array([i for i, xs in enumerate(values) if v in xs], - dtype=int) - for v in self.new_values} - - -class SplitColumnCounts(SplitColumnBase): - InheritEq = True - - def __call__(self, data): - column = data.get_column(self.attr) - values = [[ss.strip() for ss in s.split(self.delimiter)] - for s in column] - return {v: np.array([xs.count(v) for xs in values], dtype=float) - for v in self.new_values} - - -class StringEncodingBase(SharedComputeValue): - def __init__(self, fn, new_feature): - super().__init__(fn) - self.new_feature = new_feature - - def __eq__(self, other): - return super().__eq__(other) and self.new_feature == other.new_feature - - def __hash__(self): - return super().__hash__() ^ hash(self.new_feature) - - -class OneHotStrings(StringEncodingBase): - InheritEq = True - - def compute(self, data, shared_data): - indices = shared_data[self.new_feature] - col = np.zeros(len(data)) - col[indices] = 1 - return col - - -class CountStrings(StringEncodingBase): - InheritEq = True - - def compute(self, data, shared_data): - return shared_data[self.new_feature] - - -class DiscreteEncoding: - def __init__(self, variable, delimiter, onehot, value): - self.variable = variable - self.delimiter = delimiter - self.onehot = onehot - self.value = value - - def __call__(self, data): - column = data.get_column(self.variable).astype(float) - col = np.zeros(len(column)) - col[np.isnan(column)] = np.nan - for val_idx, value in enumerate(self.variable.values): - parts = value.split(self.delimiter) - if self.onehot: - col[column == val_idx] = int(self.value in parts) - else: - col[column == val_idx] = parts.count(self.value) - return col - - def __eq__(self, other): - return self.variable == other.variable \ - and self.value == other.value \ - and self.delimiter == other.delimiter \ - and self.onehot == other.onehot - - def __hash__(self): - return hash((self.variable, self.value, self.delimiter, self.onehot)) - - -class OWSplit(OWWidget): - name = "Split" - description = "Split text or categorical variables into binary indicators" - icon = "icons/Split.svg" - keywords = ["text to columns", "word encoding", "questionnaire", "survey", - "term", "word presence", "word counts", "categorical encoding", - "indicator variables"] - priority = 700 - replaces = ["orangecontrib.prototypes.widgets.owsplit.OWSplit"] - - class Inputs: - data = Input("Data", Table) - - class Outputs: - data = Output("Data", Table) - - class Warning(OWWidget.Warning): - no_disc = Msg("Data contains only numeric variables.") - - want_main_area = False - resizing_enabled = False - - Categorical, Numerical, Counts = range(3) - OutputLabels = ("Categorical (No, Yes)", "Numerical (0, 1)", "Counts") - - settingsHandler = DomainContextHandler() - attribute = ContextSetting(None) - delimiter = ContextSetting(";") - output_type = ContextSetting(Categorical) - auto_apply = Setting(True) - - def __init__(self): - super().__init__() - self.data = None - - variable_select_box = gui.vBox(self.controlArea, "Variable") - - gui.comboBox(variable_select_box, self, "attribute", - orientation=Qt.Horizontal, searchable=True, - callback=self.apply.deferred, - model=DomainModel(valid_types=(StringVariable, - DiscreteVariable))) - gui.lineEdit( - variable_select_box, self, "delimiter", "Delimiter: ", - orientation=Qt.Horizontal, callback=self.apply.deferred, - controlWidth=20).box.layout().addStretch(1) - - gui.radioButtonsInBox( - self.controlArea, self, "output_type", self.OutputLabels, - box="Output Values", - callback=self.apply.deferred) - - gui.auto_apply(self.buttonsArea, self, commit=self.apply) - - @Inputs.data - def set_data(self, data): - self.closeContext() - self.data = data - - model = self.controls.attribute.model() - model.set_domain(data.domain if data is not None else None) - self.Warning.no_disc(shown=data is not None and not model) - if not model: - self.attribute = None - self.data = None - return - self.attribute = model[0] - self.openContext(data) - self.apply.now() - - @gui.deferred - def apply(self): - if self.attribute is None: - self.Outputs.data.send(None) - return - var = self.data.domain[self.attribute] - values, computer = self._get_compute_value(var) - new_columns = self._get_new_columns(values, computer) - new_domain = Domain( - self.data.domain.attributes + new_columns, - self.data.domain.class_vars, self.data.domain.metas - ) - extended_data = self.data.transform(new_domain) - self.Outputs.data.send(extended_data) - - def _get_compute_value(self, var): - if var.is_discrete: - values = get_substrings(var.values, self.delimiter) - computer = partial( - DiscreteEncoding, - var, self.delimiter, self.output_type != self.Counts) - else: - if self.output_type == self.Counts: - sc = SplitColumnCounts(self.data, var, self.delimiter) - computer = partial(CountStrings, sc) - else: - sc = SplitColumnOneHot(self.data, var, self.delimiter) - computer = partial(OneHotStrings, sc) - values = sc.new_values - return values, computer - - def _get_new_columns(self, values, computer): - names = get_unique_names(self.data.domain, values, equal_numbers=False) - if self.output_type == self.Categorical: - return tuple( - DiscreteVariable( - name, ("No", "Yes"), compute_value=computer(value)) - for value, name in zip(values, names)) - else: - return tuple( - ContinuousVariable( - name, compute_value=computer(value)) - for value, name in zip(values, names)) - - -if __name__ == "__main__": # pragma: no cover - WidgetPreview(OWSplit).run(Table.from_file("tests/orange-in-education.tab")) diff --git a/orangecontrib/prototypes/widgets/tests/orange-in-education.tab b/orangecontrib/prototypes/widgets/tests/orange-in-education.tab deleted file mode 100644 index 3f67d1c0..00000000 --- a/orangecontrib/prototypes/widgets/tests/orange-in-education.tab +++ /dev/null @@ -1,103 +0,0 @@ -Role Orange use Familiar with Timestamp Country Classes with Orange -professor student teaching\ assistant in-class,\ in\ hands-on\ workshops in-class,\ in\ hands-on\ workshops;outside\ the\ classroom in-class,\ in\ lectures in-class,\ in\ lectures;in-class,\ in\ hands-on\ workshops in-class,\ in\ lectures;in-class,\ in\ hands-on\ workshops;outside\ the\ classroom in-class,\ in\ lectures;outside\ the\ classroom outside\ the\ classroom YouTube\ videos YouTube\ videos;lectures\ notes\ published\ on\ the\ Orange\ blog YouTube\ videos;lectures\ notes\ published\ on\ the\ Orange\ blog;published\ literature YouTube\ videos;published\ literature lectures\ notes\ published\ on\ the\ Orange\ blog lectures\ notes\ published\ on\ the\ Orange\ blog;published\ literature published\ literature time string string - meta meta meta -professor outside the classroom YouTube videos;lectures notes published on the Orange blog 2020-12-12 09:06:34 Pakistan Machine Learning -professor in-class, in lectures YouTube videos 2021-03-19 21:36:49 Portugal Data mining -student in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog 2020-12-10 03:35:34 Canada - Ontario prediction -student outside the classroom 2021-04-12 11:15:13 Italy computer science -professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog;published literature 2021-03-30 01:18:39 Ecuador computer science;text mining -student in-class, in hands-on workshops YouTube videos 2021-03-31 01:54:17 France business analytics -professor in-class, in lectures YouTube videos 2020-12-10 16:51:59 Germany Material Science -professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog;published literature 2021-03-29 04:39:05 Canada computer science -student in-class, in lectures;outside the classroom YouTube videos;lectures notes published on the Orange blog 2020-12-10 23:36:42 Sweden digital humanities -professor outside the classroom YouTube videos 2021-04-13 15:18:12 Brazil computer science;text mining -student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog;published literature 2021-03-27 19:43:11 Czech Republic big data analysis in management -teaching assistant in-class, in lectures YouTube videos 2020-12-11 13:39:51 Indonesia computer science;text mining -professor in-class, in lectures;in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog 2021-04-17 23:57:00 Switzerland digital humanities -professor in-class, in lectures YouTube videos;lectures notes published on the Orange blog;published literature 2020-12-11 07:26:54 Bulgaria computer science -professor in-class, in lectures;in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog 2020-12-16 14:49:04 Spain computer science;text mining -student in-class, in lectures;in-class, in hands-on workshops YouTube videos 2021-03-24 08:09:51 India data science -student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos 2020-12-18 11:11:20 United Kingdom computer science -student in-class, in lectures YouTube videos 2020-12-20 12:07:00 Turkey digital humanities -student outside the classroom YouTube videos 2021-04-22 04:04:37 Argentina data science -student outside the classroom YouTube videos;lectures notes published on the Orange blog 2021-04-05 07:34:26 Indonesia biology -professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom 2020-12-10 13:11:04 Latvia computer science;text mining -teaching assistant in-class, in hands-on workshops;outside the classroom lectures notes published on the Orange blog;published literature 2020-12-16 16:48:41 Portugal text mining -teaching assistant in-class, in lectures;in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog 2021-04-16 07:33:47 Egypt computer science -professor outside the classroom published literature 2021-04-08 03:01:17 Brazil digital humanities -professor in-class, in lectures YouTube videos 2020-12-15 04:49:08 India Management -student in-class, in lectures published literature 2020-12-12 18:52:57 Colombia text mining -professor in-class, in lectures YouTube videos 2020-12-17 16:54:40 Turkey computer science -student outside the classroom YouTube videos 2020-12-10 18:43:37 Ireland computer science -professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos 2020-12-11 08:23:55 India Business Administration -student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog 2021-03-22 11:44:04 Turkey Data Mining -student in-class, in lectures;outside the classroom YouTube videos 2021-03-20 16:03:47 Netherlands digital humanities -student in-class, in lectures;in-class, in hands-on workshops YouTube videos 2020-12-12 07:59:12 Indonesia computer science;text mining -student in-class, in lectures lectures notes published on the Orange blog 2021-03-21 17:38:31 Saudi Arabia Statistics -student outside the classroom YouTube videos 2021-03-26 21:24:08 United States of America - Massachusetts computer science;text mining -student in-class, in lectures;outside the classroom 2020-12-11 00:42:56 Malaysia computer science -teaching assistant outside the classroom YouTube videos 2021-03-21 21:23:57 United States of America - California Astronomy -professor in-class, in lectures;in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog 2020-12-10 22:16:14 Brazil computer science;data science -professor in-class, in hands-on workshops 2021-03-29 19:06:32 France Final studies project -student outside the classroom YouTube videos 2020-12-13 02:38:30 Australia Chemistry -professor outside the classroom 2021-04-06 19:26:58 China biology -student in-class, in hands-on workshops published literature 2020-12-12 22:27:35 China - Hong Kong SAR computer science -professor outside the classroom YouTube videos 2021-04-06 12:09:38 New Zealand text mining -professor in-class, in lectures;in-class, in hands-on workshops YouTube videos 2021-04-09 15:37:47 France computer science;text mining;data mining -teaching assistant outside the classroom YouTube videos 2020-12-19 12:45:24 Saudi Arabia computer science -professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog 2020-12-16 17:42:32 Brazil computer science -professor in-class, in hands-on workshops YouTube videos 2020-12-10 09:03:23 Russian Federation sport sciences -professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos 2021-03-19 23:54:38 Portugal Data Mining -professor outside the classroom YouTube videos;lectures notes published on the Orange blog;published literature 2021-03-29 17:27:09 Philippines text mining;Research Methods in Medicine -student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;published literature 2020-12-16 17:46:16 Ukraine computer science;artificial intelligence -professor in-class, in hands-on workshops YouTube videos 2021-03-25 14:22:49 Thailand computer science -professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos 2020-12-11 19:45:35 United States of America - California text mining;Consumer Insights -student in-class, in lectures;in-class, in hands-on workshops 2020-12-14 08:52:29 Netherlands computer science;digital humanities;design -professor in-class, in lectures published literature 2021-04-07 05:02:58 Korea (Republic of) Smart Factory -student in-class, in lectures YouTube videos 2020-12-25 21:04:48 Croatia computer science -student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog 2021-04-19 15:31:54 United Kingdom computer science -student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog 2021-04-08 15:25:51 India HR Analytics -professor in-class, in hands-on workshops;outside the classroom YouTube videos;published literature 2020-12-10 16:33:04 United States of America - Pennsylvania Electrodynamics -professor in-class, in lectures;in-class, in hands-on workshops YouTube videos 2020-12-16 17:19:53 Canada - Quebec / Québec agronomy -student in-class, in hands-on workshops published literature 2021-04-13 13:53:44 Romania text mining -student in-class, in hands-on workshops 2021-03-25 23:57:48 Brazil computer science;text mining -professor outside the classroom YouTube videos 2020-12-09 17:48:41 Thailand computer science;biology -professor outside the classroom YouTube videos 2021-03-21 14:42:52 Brazil text mining -student outside the classroom YouTube videos 2021-03-20 21:45:37 India text mining -teaching assistant outside the classroom YouTube videos 2021-04-15 19:08:14 China Transportation data analysis -student in-class, in hands-on workshops lectures notes published on the Orange blog 2020-12-15 05:49:47 India computer science;text mining -professor in-class, in hands-on workshops YouTube videos 2021-03-30 20:43:35 France computer science -student outside the classroom YouTube videos 2021-03-23 11:28:40 Argentina computer science;text mining -teaching assistant in-class, in lectures YouTube videos;lectures notes published on the Orange blog 2020-12-22 16:22:35 Germany text mining -student outside the classroom lectures notes published on the Orange blog 2021-04-08 13:22:44 India text mining -professor in-class, in lectures;outside the classroom lectures notes published on the Orange blog 2021-04-15 07:56:12 Korea (Republic of) computer science -professor in-class, in hands-on workshops lectures notes published on the Orange blog 2021-03-24 14:45:22 India computer science -student in-class, in lectures;in-class, in hands-on workshops;outside the classroom published literature 2021-04-15 04:44:38 India computer science -student in-class, in hands-on workshops YouTube videos 2021-03-29 14:36:24 United States of America - Ohio data science -student in-class, in hands-on workshops YouTube videos 2021-03-23 10:27:41 Singapore text mining -professor outside the classroom YouTube videos 2020-12-11 14:16:14 Indonesia computer science -teaching assistant in-class, in lectures;outside the classroom YouTube videos 2020-12-22 22:29:56 Japan text mining -student in-class, in lectures YouTube videos 2020-12-15 14:56:45 Indonesia computer science -student outside the classroom YouTube videos 2021-04-10 19:19:58 Italy biology -student in-class, in lectures 2021-03-22 12:51:18 United Kingdom computer science -student in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog;published literature 2021-04-12 18:44:38 Brazil text mining -student outside the classroom YouTube videos 2021-04-08 20:43:15 Brazil My personal work -professor in-class, in lectures YouTube videos;lectures notes published on the Orange blog 2021-04-10 15:18:30 China - Taiwan Big data analysis -teaching assistant in-class, in lectures YouTube videos;lectures notes published on the Orange blog 2020-12-10 01:18:39 Indonesia text mining -professor outside the classroom published literature 2020-12-23 21:06:49 Turkey biology -professor in-class, in lectures YouTube videos 2021-04-12 07:42:57 Korea (Republic of) Business Administration -professor in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog 2021-04-13 12:01:24 Oman computer science -teaching assistant outside the classroom YouTube videos;lectures notes published on the Orange blog 2021-03-22 21:14:46 Canada - Ontario Geological Engineering -student outside the classroom YouTube videos 2021-04-19 18:31:40 Argentina computer science -professor in-class, in hands-on workshops published literature 2021-04-10 11:06:15 Russian Federation computer science -professor in-class, in hands-on workshops published literature 2020-12-10 13:19:53 Mexico computer science -professor in-class, in lectures YouTube videos;lectures notes published on the Orange blog 2020-12-13 21:39:59 United States of America - Florida text mining;sport analytics -professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom 2021-03-19 21:39:43 Germany ethics in digital transformation -teaching assistant outside the classroom YouTube videos;published literature 2021-03-19 17:56:23 Hungary computer science;text mining;health management -student in-class, in lectures;in-class, in hands-on workshops;outside the classroom 2021-03-22 13:44:29 India data science -professor in-class, in lectures lectures notes published on the Orange blog 2020-12-10 21:41:44 Brazil industrial automation -student outside the classroom YouTube videos 2020-12-09 16:39:59 Spain text mining -student outside the classroom published literature 2020-12-18 18:13:38 Brazil biology -student outside the classroom lectures notes published on the Orange blog;published literature 2021-03-30 17:45:22 Brazil computer science;text mining -professor in-class, in hands-on workshops;outside the classroom YouTube videos 2021-03-25 14:34:24 Brazil computer science -professor in-class, in lectures YouTube videos;lectures notes published on the Orange blog 2021-04-22 00:44:51 Portugal computer science diff --git a/orangecontrib/prototypes/widgets/tests/test_owsplit.py b/orangecontrib/prototypes/widgets/tests/test_owsplit.py deleted file mode 100644 index 34aa6f87..00000000 --- a/orangecontrib/prototypes/widgets/tests/test_owsplit.py +++ /dev/null @@ -1,328 +0,0 @@ -# pylint: disable=missing-docstring,unsubscriptable-object -import os -import unittest - -import numpy as np - -from Orange.data import Table, StringVariable, Domain, DiscreteVariable -from Orange.widgets.tests.base import WidgetTest - -from orangecontrib.prototypes.widgets.owsplit import \ - OWSplit, SplitColumnOneHot, get_substrings, OneHotStrings, \ - DiscreteEncoding, SplitColumnCounts, CountStrings - - -class TestComputation(unittest.TestCase): - def setUp(self): - domain = Domain( - [ - DiscreteVariable("x", values=("a c d c bb bb bb", "bb d")) - ], - None, - [ - StringVariable("foo"), - StringVariable("bar") - ]) - self.data = Table.from_numpy( - domain, - np.array([[1], [0], [np.nan]]), None, - [["a,bbb,d,a,a", "e;f o"], ["", "f o"], ["bbb,d,bbb", "e;a;o"]] - ) - - -class TestSplitColumn(TestComputation): - def test_get_string_values(self): - np.testing.assert_equal( - set(get_substrings({"a bc", "d,e", "", "f,a t", "t"}, " ")), - {"a", "bc", "d,e", "f,a", "t"}) - np.testing.assert_equal( - set(get_substrings({"a bc", "d,e", "", "f,a t", "t"}, ",")), - {"a bc", "d", "e", "f", "a t", "t"}) - - def test_split_column_one_hot(self): - sc = SplitColumnOneHot(self.data, self.data.domain.metas[0], ",") - shared = sc(self.data) - self.assertEqual(set(sc.new_values), {"a", "bbb", "d"}) - self.assertEqual(set(shared), set(sc.new_values)) - np.testing.assert_equal(shared["a"], [0]) - np.testing.assert_equal(shared["bbb"], [0, 2]) - np.testing.assert_equal(shared["d"], [0, 2]) - - sc = SplitColumnOneHot(self.data, self.data.domain.metas[1], ";") - shared = sc(self.data) - self.assertEqual(set(sc.new_values), {"a", "e", "f o", "o"}) - self.assertEqual(set(shared), set(sc.new_values)) - np.testing.assert_equal(shared["a"], [2]) - np.testing.assert_equal(shared["e"], [0, 2]) - np.testing.assert_equal(shared["f o"], [0, 1]) - np.testing.assert_equal(shared["o"], [2]) - - def test_split_column_counts(self): - sc = SplitColumnCounts(self.data, self.data.domain.metas[0], ",") - shared = sc(self.data) - self.assertEqual(set(sc.new_values), {"a", "bbb", "d"}) - self.assertEqual(set(shared), set(sc.new_values)) - np.testing.assert_equal(shared["a"], [3, 0, 0]) - np.testing.assert_equal(shared["bbb"], [1, 0, 2]) - np.testing.assert_equal(shared["d"], [1, 0, 1]) - - def test_no_known_values(self): - sc = SplitColumnOneHot(self.data, self.data.domain.metas[0], ",") - data = Table.from_numpy( - self.data.domain, np.zeros((3, 1)), None, - np.array([["x"] * 2] * 3)) - shared = sc(data) - for attr in ("a", "bbb", "d"): - self.assertEqual(shared[attr].size, 0) - oh = OneHotStrings(sc, attr) - np.testing.assert_equal(oh(data), [0, 0, 0]) - -class TestStringEncoding(TestComputation): - def test_one_hot_strings(self): - attr = self.data.domain.metas[0] - sc = SplitColumnOneHot(self.data, attr, ",") - - oh = OneHotStrings(sc, "a") - np.testing.assert_equal(oh(self.data), [1, 0, 0]) - - oh = OneHotStrings(sc, "bbb") - np.testing.assert_equal(oh(self.data), [1, 0, 1]) - - data = Table.from_numpy( - Domain([], None, [attr]), - np.zeros((5, 0)), None, - np.array(["bbb,x,y", "", "bbb", "bbb,a", "foo"])[:, None]) - np.testing.assert_equal(oh(data), [1, 0, 1, 1, 0]) - - def test_count_strings(self): - attr = self.data.domain.metas[0] - sc = SplitColumnCounts(self.data, attr, ",") - - oh = CountStrings(sc, "a") - np.testing.assert_equal(oh(self.data), [3, 0, 0]) - - oh = CountStrings(sc, "bbb") - np.testing.assert_equal(oh(self.data), [1, 0, 2]) - - oh = CountStrings(sc, "d") - np.testing.assert_equal(oh(self.data), [1, 0, 1]) - - -class TestDiscreteEncoding(TestComputation): - def test_one_hot_discrete(self): - attr = self.data.domain.attributes[0] - - oh = DiscreteEncoding(attr, " ", True, "a") - np.testing.assert_equal(oh(self.data), [0, 1, np.nan]) - - oh = DiscreteEncoding(attr, " ", True, "d") - np.testing.assert_equal(oh(self.data), [1, 1, np.nan]) - - data = Table.from_numpy( - Domain([attr], None), - np.array([1, 0, 1, 0, np.nan])[:, None]) - - oh = DiscreteEncoding(attr, " ", True, "a") - np.testing.assert_equal(oh(data), [0, 1, 0, 1, np.nan]) - - oh = DiscreteEncoding(attr, " ", True, "d") - np.testing.assert_equal(oh(data), [1, 1, 1, 1, np.nan]) - - def test_discrete_counts(self): - attr = self.data.domain.attributes[0] - - oh = DiscreteEncoding(attr, " ", False, "a") - np.testing.assert_equal(oh(self.data), [0, 1, np.nan]) - oh = DiscreteEncoding(attr, " ", False, "bb") - np.testing.assert_equal(oh(self.data), [1, 3, np.nan]) - with self.data.unlocked(): - self.data.X[2, 0] = 0 - np.testing.assert_equal(oh(self.data), [1, 3, 3]) - - def test_discrete_metas(self): - attr = DiscreteVariable("x", values=("a c d", "bb d")) - domain = Domain([], None, [attr]) - data = Table.from_numpy(domain, np.zeros((3, 0)), None, - np.array([1, 0, np.nan])[:, None]) - oh = DiscreteEncoding(attr, " ", True, "a") - np.testing.assert_equal(oh(data), [0, 1, np.nan]) - - - -class TestOWSplit(WidgetTest): - def setUp(self): - self.widget = self.create_widget(OWSplit) - test_path = os.path.dirname(os.path.abspath(__file__)) - self.data = Table.from_file(os.path.join(test_path, "orange-in-education.tab")) - self._create_simple_corpus() - - def _set_attr(self, attr, widget=None): - if widget is None: - widget = self.widget - attr_combo = widget.controls.attribute - idx = attr_combo.model().indexOf(attr) - attr_combo.setCurrentIndex(idx) - attr_combo.activated.emit(idx) - - def _create_simple_corpus(self) -> None: - """ - Create a simple dataset with 4 documents. - """ - metas = np.array( - [ - ["foo,"], - ["bar,baz , bar, bar"], - ["foo,bar, foo"], - [""], - ] - ) - text_var = StringVariable("foo") - domain = Domain([], metas=[text_var]) - self.small_table = Table.from_numpy( - domain, - X=np.empty((len(metas), 0)), - metas=metas, - ) - - def test_data(self): - """Basic functionality""" - self.send_signal(self.widget.Inputs.data, self.data) - self._set_attr(self.data.domain.attributes[1]) - output = self.get_output(self.widget.Outputs.data) - self.assertEqual(len(output.domain.attributes), - len(self.data.domain.attributes) + 3) - self.assertTrue("in-class, in hands-on workshops" in output.domain - and "in-class, in lectures" in output.domain and - "outside the classroom" in output.domain) - np.testing.assert_array_equal(output[:10, "in-class, in hands-on " - "workshops"], - np.array([0, 0, 1, 0, 1, 1, 0, 1, 0, 0] - ).reshape(-1, 1)) - np.testing.assert_array_equal(output[:10, "in-class, in lectures"], - np.array([0, 1, 0, 0, 1, 0, 1, 1, 1, 0] - ).reshape(-1, 1)) - np.testing.assert_array_equal(output[:10, "outside the classroom"], - np.array([1, 0, 1, 1, 1, 0, 0, 1, 1, 1] - ).reshape(-1, 1)) - def test_empty_data(self): - """Do not crash on empty data""" - self.send_signal(self.widget.Inputs.data, None) - - def test_discrete(self): - """No crash on data attributes of different types""" - self.send_signal(self.widget.Inputs.data, self.data) - self.assertEqual(self.widget.attribute, self.data.domain.metas[1]) - self._set_attr(self.data.domain.attributes[1]) - self.assertEqual(self.widget.attribute, self.data.domain.attributes[1]) - - def test_numeric_only(self): - """Error raised when only numeric variables given""" - housing = Table.from_file("housing") - self.send_signal(self.widget.Inputs.data, housing) - self.assertTrue(self.widget.Warning.no_disc.is_shown()) - - def test_split_nonexisting(self): - """Test splitting when delimiter doesn't exist""" - self.widget.delimiter = "|" - self.send_signal(self.widget.Inputs.data, self.data) - new_cols = set(self.data.get_column("Country")) - self.assertFalse(any(self.widget.delimiter in v for v in new_cols)) - self.assertEqual(len(self.get_output( - self.widget.Outputs.data).domain.attributes), - len(self.data.domain.attributes) + len(new_cols)) - - def test_output_string(self): - "Test outputs; at the same time, test for duplicate variables" - self.widget.delimiter = "," - self.send_signal(self.widget.Inputs.data, self.small_table) - out = self.get_output(self.widget.Outputs.data) - self.assertEqual([attr.name for attr in out.domain.attributes], - ["bar", "baz", "foo (1)"]) - np.testing.assert_equal(out.X, - [[0, 0, 1], - [1, 1, 0], - [1, 0, 1], - [0, 0, 0]]) - - def test_output_discrete(self): - w = self.widget - w.delimiter = " " - w.output_type = w.Categorical - - attr = DiscreteVariable( - "x", - values=("bar foo bar bar foo foo foo", "bar baz", "crux crux")) - data = Table.from_numpy( - Domain([attr], None), - np.array([1, 1, 0, 1, 2, np.nan])[:, None], None) - - counts = np.array([[1, 1, 0, 0], - [1, 1, 0, 0], - [3, 0, 0, 4], - [1, 1, 0, 0], - [0, 0, 2, 0], - [np.nan, np.nan, np.nan, np.nan]]) - exp_hot = np.hstack((data.X, np.vstack((counts[:-1] > 0, [[np.nan] * 4])))) - - self.send_signal(w.Inputs.data, data) - out = self.get_output(w.Outputs.data) - self.assertEqual([attr.name for attr in out.domain.attributes], - ["x", "bar", "baz", "crux", "foo"]) - for attr in out.domain.attributes[1:]: - self.assertTrue(attr.is_discrete) - self.assertEqual(attr.values, ("No", "Yes")) - np.testing.assert_equal(out.X, exp_hot) - - w.controls.output_type.buttons[w.Numerical].click() - out = self.get_output(w.Outputs.data) - self.assertEqual([attr.name for attr in out.domain.attributes], - ["x", "bar", "baz", "crux", "foo"]) - for attr in out.domain.attributes[1:]: - self.assertTrue(attr.is_continuous) - np.testing.assert_equal(out.X, exp_hot) - - w.controls.output_type.buttons[w.Counts].click() - out = self.get_output(w.Outputs.data) - self.assertEqual([attr.name for attr in out.domain.attributes], - ["x", "bar", "baz", "crux", "foo"]) - for attr in out.domain.attributes[1:]: - self.assertTrue(attr.is_continuous) - np.testing.assert_equal( - out.X, - np.hstack((data.X, np.vstack((counts[:-1], [[np.nan] * 4]))))) - - def test_output_types_string(self): - w = self.widget - w.delimiter = "," - w.output_type = w.Categorical - - self.send_signal(w.Inputs.data, self.small_table) - counts = np.array([[0, 0, 1], [3, 1, 0], [1, 0, 2], [0, 0, 0]]) - - out = self.get_output(w.Outputs.data) - self.assertEqual([attr.name for attr in out.domain.attributes], - ["bar", "baz", "foo (1)"]) - for attr in out.domain.attributes: - self.assertTrue(attr.is_discrete) - self.assertEqual(attr.values, ("No", "Yes")) - np.testing.assert_equal(out.X, counts > 0) - - w.controls.output_type.buttons[w.Numerical].click() - out = self.get_output(w.Outputs.data) - self.assertEqual([attr.name for attr in out.domain.attributes], - ["bar", "baz", "foo (1)"]) - for attr in out.domain.attributes: - self.assertTrue(attr.is_continuous) - np.testing.assert_equal(out.X, counts > 0) - - w.controls.output_type.buttons[w.Counts].click() - out = self.get_output(w.Outputs.data) - self.assertEqual([attr.name for attr in out.domain.attributes], - ["bar", "baz", "foo (1)"]) - for attr in out.domain.attributes: - self.assertTrue(attr.is_continuous) - np.testing.assert_equal(out.X, counts) - - -if __name__ == "__main__": - unittest.main()