-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathTextPreprocessing.R
More file actions
45 lines (37 loc) · 1.87 KB
/
TextPreprocessing.R
File metadata and controls
45 lines (37 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Read a CSV of documents and expand it into one row per (document, topic)
# pair, restricted to the ten most populous topics.
#
# Arguments:
#   input.filename  Path to a comma-separated file with a header row. It must
#                   contain the columns `doc.title` and `doc.text`, plus one
#                   indicator column per topic whose name contains "topic"
#                   (case-insensitive) and whose value is 1 when the document
#                   carries that topic.
#
# Returns:
#   A data frame with columns `topic` (factor), `title` and `text`, holding
#   one row for every flagged topic of every document that has non-empty
#   text, filtered to the ten topics listed below and randomly shuffled.
#   A zero-row data frame with the same columns is returned when nothing
#   qualifies.
#
# Notes:
#   * The shuffle uses R's random number generator; call set.seed() first for
#     reproducible output.
#   * Documents whose text is empty or NA are dropped; NA topic flags are
#     treated as "not flagged".
preprocess <- function(input.filename) {
  # Kept for backward compatibility: the original set this global option and
  # code that runs after preprocess() may rely on it. The calls below pass
  # stringsAsFactors explicitly, so this function does not depend on it.
  options(stringsAsFactors = FALSE)
  print("Reading in data")

  # The 10 most populous classes, and the ones we'll use for evaluation
  populous <- c(
    "topic.earn", "topic.acq", "topic.money.fx", "topic.grain", "topic.crude",
    "topic.trade", "topic.interest", "topic.ship", "topic.wheat", "topic.corn"
  )

  # get in the data
  input.raw <- read.csv(
    file = input.filename, header = TRUE, sep = ",", stringsAsFactors = FALSE
  )

  # Fail early with a clear message instead of an obscure error further down.
  missing.cols <- setdiff(c("doc.title", "doc.text"), names(input.raw))
  if (length(missing.cols) > 0) {
    stop(
      "input file is missing required column(s): ",
      paste(missing.cols, collapse = ", "),
      call. = FALSE
    )
  }

  # find the columns that identify the topics
  topicColumns <- grep(
    "topic", names(input.raw), ignore.case = TRUE, value = FALSE
  )
  topic.names <- names(input.raw)[topicColumns]
  flags <- as.matrix(input.raw[, topicColumns, drop = FALSE])

  doc.title <- input.raw[["doc.title"]]
  doc.text <- input.raw[["doc.text"]]

  # A document is kept only if it has at least one topic and contains text
  # (otherwise it is dropped).
  keep.doc <- rowSums(flags, na.rm = TRUE) > 0 &
    !is.na(doc.text) & doc.text != ""

  # Transpose so that column-major traversal walks document by document and,
  # within a document, topic column by topic column. This reproduces the row
  # order the nested loops used to build with rbind().
  is.hit <- t(!is.na(flags) & flags == 1)
  topic.idx <- row(is.hit)[is.hit]
  doc.idx <- col(is.hit)[is.hit]

  from.kept.doc <- keep.doc[doc.idx]
  topic.idx <- topic.idx[from.kept.doc]
  doc.idx <- doc.idx[from.kept.doc]

  # one output row per (document, topic), using the actual name of the topic
  output.df <- data.frame(
    topic = topic.names[topic.idx],
    title = doc.title[doc.idx],
    text = doc.text[doc.idx],
    stringsAsFactors = FALSE
  )

  # choose the documents that have the 10 most popular topics
  output.df <- output.df[output.df$topic %in% populous, , drop = FALSE]

  # shuffle up the instances for bias free k fold
  n.out <- nrow(output.df)
  shuffled <- sample.int(n.out, size = n.out, replace = FALSE)
  output.df <- output.df[shuffled, , drop = FALSE]

  # we want the topic to be a factor
  output.df$topic <- factor(output.df$topic)

  # return a dataframe with the topic, title and text for the correct documents
  output.df
}