# ml1.py
import warnings

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# NotOpenSSLWarning lives in urllib3 v2+; imported so the filter below resolves
from urllib3.exceptions import NotOpenSSLWarning
from wordcloud import WordCloud
# Load the data
csv_file_path = './zhenmuyesv.csv'
df = pd.read_csv(csv_file_path)
print(sklearn.__version__)
# Silence the warning urllib3 (v2+) emits when Python is linked against LibreSSL
warnings.filterwarnings("ignore", category=NotOpenSSLWarning)
# Text preprocessing
# The text column is assumed to be named 'content'
def preprocess_text(content):
    # More elaborate preprocessing steps can be added here; see the
    # stop-word sketch below
    return " ".join(jieba.cut(content))

df['processed_text'] = df['content'].apply(preprocess_text)
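# A minimal sketch of one such extra step: stop-word filtering. The inline
# stop-word list here is illustrative only, not part of the original script.
STOPWORDS = {"的", "了", "是", "在", "和", "也", "就"}

def preprocess_text_no_stopwords(content):
    # Keep only non-empty tokens that are not in the stop-word list
    return " ".join(w for w in jieba.cut(content) if w.strip() and w not in STOPWORDS)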
# Word-frequency analysis
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

top_words = get_top_n_words(df['processed_text'], 10)
# Sentiment analysis (placeholder; needs a real implementation)
def sentiment_analysis(content):
    # A Chinese sentiment model is needed here; this stub returns random
    # labels (see the SnowNLP sketch below)
    return np.random.choice(['Positive', 'Negative', 'Neutral'])

df['sentiment'] = df['processed_text'].apply(sentiment_analysis)
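# A sketch of how the stub could be replaced with a real model, assuming the
# SnowNLP package (not used in the original script) is installed. SnowNLP's
# .sentiments score lies in [0, 1], higher meaning more positive; the 0.4/0.6
# cut-offs are illustrative. Guarded so the script still runs without SnowNLP.
try:
    from snownlp import SnowNLP

    def sentiment_analysis_snownlp(content):
        score = SnowNLP(content).sentiments
        if score > 0.6:
            return 'Positive'
        if score < 0.4:
            return 'Negative'
        return 'Neutral'
except ImportError:
    pass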
# Topic modelling
def lda_topic_modeling(corpus):
    vectorizer = CountVectorizer()
    data_vectorized = vectorizer.fit_transform(corpus)
    # Build the LDA model
    lda_model = LatentDirichletAllocation(n_components=5, max_iter=10, learning_method='online')
    lda_Z = lda_model.fit_transform(data_vectorized)
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
    return lda_model, vectorizer.get_feature_names_out(), lda_Z

lda_model, lda_feature_names, lda_Z = lda_topic_modeling(df['processed_text'])
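# lda_Z is a (documents x topics) matrix of topic weights, so the dominant
# topic of each document can be read off with argmax (illustrative use, not
# in the original script)
df['dominant_topic'] = lda_Z.argmax(axis=1)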
# Visualisation
# Word cloud
def plot_wordcloud(word_freq):
    # Note: for Chinese text, pass font_path= pointing to a CJK font,
    # otherwise the characters render as empty boxes
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(dict(word_freq))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

plot_wordcloud(dict(top_words))
# Word-frequency bar chart
def plot_top_words(word_freq):
    words = [word for word, freq in word_freq]
    freqs = [freq for word, freq in word_freq]
    plt.figure(figsize=(10, 5))
    plt.bar(words, freqs)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.title('Top Words in Corpus')
    plt.show()

plot_top_words(top_words)
# Topic display
def plot_top_topics(lda_model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(lda_model.components_):
        print(f"Topic #{topic_idx + 1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    # Further visualisation could be added here; see the sketch below

# Show the topics
plot_top_topics(lda_model, lda_feature_names)
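# A minimal sketch of the further visualisation alluded to above: a horizontal
# bar chart of the top words in a single topic (illustrative, not part of the
# original script)
def plot_topic_bar(lda_model, feature_names, topic_idx=0, n_top_words=10):
    topic = lda_model.components_[topic_idx]
    top_ids = topic.argsort()[:-n_top_words - 1:-1]
    plt.figure(figsize=(8, 4))
    plt.barh([feature_names[i] for i in top_ids], topic[top_ids])
    plt.gca().invert_yaxis()  # largest weight at the top
    plt.title(f'Topic #{topic_idx + 1}')
    plt.show()

plot_topic_bar(lda_model, lda_feature_names)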
# Note: this is a static example; some parts (such as the sentiment analysis)
# need to be implemented to suit your actual data.