Skip to content

Commit eb4fe73

Browse files
committed
add html stripper
Signed-off-by: Lejin Varghese <lejinsnests@gmail.com>
1 parent 6a0cca3 commit eb4fe73

3 files changed

Lines changed: 15 additions & 4 deletions

File tree

deep_learning/moe/README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,11 @@
77

88
## Datasets
99
- [x] Home Depot
10-
- [ ] Amazon
10+
- [x] Amazon
1111
- [ ] Google
1212
- [ ] Wayfair
1313
- [ ] Crowdflower
14-
- [ ] Walmart
14+
- [ ] Walmart
15+
16+
## Improvements
17+
- [ ] Add more Amazon metadata

deep_learning/moe/adapters/aggregator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class DatasetAggregator:
1313
def __init__(
1414
self,
1515
sample_size: Optional[int] = None,
16-
splits: list[str] = ["train", "valid", "test"],
16+
splits: list[str] = ["train", "test"],
1717
):
1818
self.sources = [AmazonDataset, HomeDepotDataset]
1919
self.sample_size = sample_size

deep_learning/moe/adapters/core.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from abc import ABC
22
from multiprocessing import cpu_count
33
import json
4+
import re
45
from click import secho
56
from datasets import load_dataset, Dataset
67

@@ -38,6 +39,13 @@ def generate_query(self):
3839
def generate_document(self):
3940
pass
4041

42+
@staticmethod
def strip_html(text: object) -> str:
    """Return *text* with HTML/XML-style tags removed.

    Anything from a ``<`` up to the nearest following ``>`` is treated as a
    tag and deleted. This is a lightweight regex scrub, not an HTML parser:
    character entities such as ``&amp;`` are left untouched, and literal
    ``<`` ... ``>`` pairs in ordinary text are removed as well.

    Args:
        text: The value to clean. Anything that is not a ``str``
            (for example ``None`` or a number) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""
    # DOTALL lets "." cross line breaks, so a tag whose attributes wrap onto
    # another line is still removed. The lazy quantifier stops at the first
    # ">" so adjacent tags are stripped one at a time.
    return re.sub(r"<.*?>", "", text, flags=re.DOTALL)
48+
4149
@staticmethod
4250
def format_document(**kwargs):
4351
if kwargs.get("title"):
@@ -53,7 +61,7 @@ def format_document(**kwargs):
5361

5462
if kwargs.get("description"):
5563
template += f"""**product description**: {kwargs.get('description')}"""
56-
return template.strip().lower()
64+
return BaseDataset.strip_html(template.strip().lower())
5765

5866
def load(self, split: str, cols: list[str] = None):
5967
secho(

0 commit comments

Comments
 (0)