vectorizer_pipeline.py
import glob
import re
from pathlib import Path

import joblib
import nltk
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
# import string  # only needed if the NLTK tokenizer below is re-enabled

# Load spaCy with the parser and NER disabled; only tagging/lemmatization
# is needed for tokenization.
spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# NLTK lemmatizer, used by the earlier NLTK-based tokenizer kept below.
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text_tokens):
    return [lemmatizer.lemmatize(w) for w in text_tokens]


# Earlier NLTK-based tokenizer, kept for reference. The original version
# returned the un-lemmatized tokens; fixed here to return the lemmatized ones.
# def tokenize(doc):
#     doc_1 = doc.translate(doc.maketrans('', '', string.punctuation))
#     word_tokens = nltk.word_tokenize(doc_1)
#     no_stop_doc = [w for w in word_tokens if w.isalpha()]
#     lemmatized_doc = lemmatize_text(no_stop_doc)
#     return lemmatized_doc


def tokenize(doc):
    """Lemmatize a document with spaCy, dropping stop words and punctuation."""
    doc_1 = spacy_nlp(doc)
    return [token.lemma_ for token in doc_1
            if not token.is_stop and not token.is_punct]
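
# Illustrative behaviour (assumption: default en_core_web_sm stop-word and
# lemma data):
#   tokenize('The cats are running!')  ->  ['cat', 'run']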

# Collect (file name, contents) pairs from the crawled pages, collapsing
# runs of spaces in the text. The glob is anchored to the script's own
# directory so the paths match the joblib/pickle output paths below.
data_array = []
currpath = Path(__file__).parent
files = glob.glob(str(currpath / 'DataFiles/CrawledData/20200510/*'))
print(len(files))
for file in files:
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        data_array.append([Path(file).name,
                           re.sub(' +', ' ', f.read())])
df = pd.DataFrame(data_array, columns=['File', 'Contents'])
# Each crawled file stores the source URL on its first line and the page
# text on the second.
df['Link'] = df['Contents'].apply(lambda x: x.split("\n")[0])
df['Doc'] = df['Contents'].apply(lambda x: x.split("\n")[1])

# token_pattern is ignored whenever a custom tokenizer is supplied, so the
# original r'\b[a-zA-Z]{3,}\b' pattern is dropped here; all token filtering
# happens in tokenize() above.
vectorizer = TfidfVectorizer(tokenizer=tokenize,
                             strip_accents='ascii',
                             ngram_range=(1, 3),
                             max_features=160000,
                             sublinear_tf=True)
tfidfs = vectorizer.fit_transform(df['Contents'])

# Persist the fitted vectorizer, the TF-IDF matrix, and the document table
# for downstream querying.
joblib.dump(vectorizer, currpath / 'DataFiles/vectorizer.joblib')
joblib.dump(tfidfs, currpath / 'DataFiles/tfidf.joblib')
df.to_pickle(currpath / 'DataFiles/dataFrame_bk.pkl')
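
A minimal sketch of how the saved artifacts might be queried downstream, assuming the pipeline above has already been run. The query string and the top-5 ranking are illustrative choices, not part of this file; note also that unpickling the vectorizer requires the custom tokenize() function to be importable under the module name it was pickled with.

from pathlib import Path

import joblib
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

currpath = Path(__file__).parent

# Load the artifacts written by vectorizer_pipeline.py.
vectorizer = joblib.load(currpath / 'DataFiles/vectorizer.joblib')
tfidfs = joblib.load(currpath / 'DataFiles/tfidf.joblib')
df = pd.read_pickle(currpath / 'DataFiles/dataFrame_bk.pkl')

# Vectorize a free-text query with the same vocabulary, then rank the
# crawled documents by cosine similarity against the stored TF-IDF rows.
query = 'example search terms'  # hypothetical query
query_vec = vectorizer.transform([query])
scores = cosine_similarity(query_vec, tfidfs).ravel()

# Print the five best-matching documents and their source links.
top = scores.argsort()[::-1][:5]
print(df.iloc[top][['File', 'Link']])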