-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathexample.py
More file actions
37 lines (31 loc) · 1.13 KB
/
example.py
File metadata and controls
37 lines (31 loc) · 1.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from gensim.models import Word2Vec
from xtractor import TopicExtractor as te
import pandas as pd
# Load the word-vector model from disk.
# "hu/hu.bin" is supposed to be a gensim-compatible word-vector file.
model = Word2Vec.load("hu/hu.bin")
# Category definitions: one dict per label.
#   "name"     - the label name.
#   "keywords" - the words that describe the label. Choose them carefully:
#                they are the heart and soul of the whole topic extraction.
# NOTE(review): presumably each keyword has to exist in the loaded model's
# vocabulary to contribute anything - confirm against xtractor. A misspelled
# keyword ("bussiness") was corrected to "business" for that reason.
categories = [
    {
        "name": "economy",
        "keywords": [
            "money", "business", "used", "economy",
            "credit", "growth", "entrepreneur", "euro",
        ],
    },
    {
        "name": "sport",
        "keywords": [
            "ball", "car", "rank", "match", "game",
            "fan", "stadium", "sport", "run",
        ],
    },
]
# Example DataFrame with a single "Text" column.
# Every row is supposed to be an already-tokenized sentence, hence the list
# form: one token per list element (so "Masters" and "spot" are separate
# entries). The trailing comment on each row names the expected topic.
df = pd.DataFrame({"Text": [
    ["Poulter", "wins", "Houston", "play-off",
     "clinch", "final", "Masters", "spot"],  # sport
    ["General", "Principle", "wins", "Irish", "Grand", "National"],  # sport
    ["Facebook", "chief", "fires", "back", "Apple", "boss"],  # economy
]})
# Build the extractor from a list of word-vector models (a single one here)
# and the category definitions, run it over the example DataFrame, and print
# whatever extract() returns.
# NOTE(review): presumably `results` holds the predicted label for each row -
# confirm against the xtractor package.
extractor = te.TopicExtractor([model], categories)
results = extractor.extract(df)
print(results)