# xtractor/example.py (nagypeterjob/xtractor, master branch)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from gensim.models import Word2Vec
from xtractor import TopicExtractor as te
import pandas as pd

# hu.bin is supposed to be a gensim-compatible word-vector model file.
# It is loaded from the relative path "hu/hu.bin", so run this script from a
# directory that contains the "hu" folder.
model = Word2Vec.load("hu/hu.bin")

# This list contains your label definitions: one dict per topic, holding the
# label "name" and the "keywords" that describe it.
# You need to choose the keywords carefully, as they are the heart and soul of
# the whole topic extraction.
# NOTE(review): the keywords are presumably matched against the word-vector
# model's vocabulary, so they must be spelled correctly -- confirm in xtractor.
categories = [
    {
        "name": "economy",
        "keywords": [
            'money', 'business', 'used', 'economy',
            'credit', 'growth', 'entrepreneur', 'euro',
        ],
    },
    {
        "name": "sport",
        "keywords": [
            'ball', 'car', 'rank', 'match', 'game',
            'fan', 'stadium', 'sport', 'run',
        ],
    },
]

# Example DataFrame.
# Each entry of the "Text" column is one already-tokenized sentence, hence the
# list form: every list element is a single token (one word, no spaces).
df = pd.DataFrame({"Text": [
    ['Poulter', 'wins', 'Houston', 'play-off', 'clinch', 'final', 'Masters', 'spot'],  # sport
    ['General', 'Principle', 'wins', 'Irish', 'Grand', 'National'],  # sport
    ['Facebook', 'chief', 'fires', 'back', 'Apple', 'boss'],  # economy
]})

# Build the topic extractor from a list of word-vector models (only one is
# used in this example) and the category definitions.
extractor = te.TopicExtractor([model], categories)
# Run the extraction over the example DataFrame.
# NOTE(review): the structure of the returned value is not visible from this
# script -- check xtractor's TopicExtractor.extract for details.
results = extractor.extract(df)

# Show the extraction output on stdout.
print(results)