
# MySQL

# Install RMySQL first; getting it to build was quite a hassle: http://lsfalimis.github.io/link--install-rmysql-on-mavericks/

library(RMySQL)

con = dbConnect(MySQL(), user="USERNAME", password="PASSWORD", dbname="DATABASENAME", host="HOST")
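
# Optionally, confirm the connection works by listing the tables it can see
# (dbListTables() comes from the DBI package that RMySQL builds on)
dbListTables(con)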

# So that Chinese text is not displayed as question marks

dbSendQuery(con, 'set names utf8')

# COLUMN is the column holding the Weibo text

rs = dbSendQuery(con, "select COLUMN from TABLE limit 100")

data = fetch(rs, n=-1)
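
# Housekeeping sketch: once the rows are fetched, the result set and the
# connection can be released (dbClearResult() and dbDisconnect() are standard DBI calls)
dbClearResult(rs)
dbDisconnect(con)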


# Clean

# Remove URLs

data = gsub(pattern="http[s]?:[a-zA-Z\\/\\.0-9]+", replacement="", x=data)
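
# A further cleaning sketch: Weibo text usually also contains @mentions and
# #topic# tags; the two patterns below are assumptions about that format,
# so adjust them to your own data before relying on them
data = gsub(pattern="@[^ :：,，]+", replacement="", x=data)
data = gsub(pattern="#[^#]*#", replacement="", x=data)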


# Segment

library("Rwordseg")

# Segment each Weibo post into words

corpus <- lapply(X=data, FUN=segmentCN)
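
# segmentCN() returns a list with one character vector of words per post.
# A common follow-up sketch is to paste each vector back into a single
# space-separated string, so every post maps to exactly one document in the corpus below
corpus <- sapply(corpus, FUN=paste, collapse=" ")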


# Convert into corpus

library(tm)

# Turn the character vector corpus into a tm corpus

doc.cor = Corpus(VectorSource(corpus))

# Weibo posts contain plenty of English words, so English is handled too; convert
# everything to lowercase before removing English stop words

doc.cor = tm_map(doc.cor, tolower)
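
# Note: with tm >= 0.6 this direct use of tolower may error; wrapping it as
# content_transformer(tolower) is the usual fix there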

# Remove digits

doc.cor = tm_map(doc.cor, removeNumbers)

# The cleaning steps above can leave extra whitespace behind; strip it here

doc.cor = tm_map(doc.cor, stripWhitespace)

doc.cor = tm_map(doc.cor, PlainTextDocument)

stopwordsCN = readLines("stopwordsCN.txt")

# The default minimum word length is 3; lower it to 1 here so single-character Chinese words are kept

control = list(stopwords = stopwordsCN, wordLengths = c(1, Inf))

# Terms as rows, documents as columns

doc.dtm <- TermDocumentMatrix(doc.cor, control)

# Documents as rows, terms as columns

# doc.dtm <- DocumentTermMatrix(doc.cor, control)

inspect(doc.dtm)
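
# A quick follow-up sketch: findFreqTerms() from tm lists the terms that occur
# at least a given number of times; the threshold of 5 below is an arbitrary choice
findFreqTerms(doc.dtm, lowfreq = 5)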