
# MySQL

# Install RMySQL first; getting it to build was quite a hassle: http://lsfalimis.github.io/link--install-rmysql-on-mavericks/

library(RMySQL)

con = dbConnect(MySQL(), user="USERNAME", password="PASSWORD", dbname="DATABASENAME", host="HOST")
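
# Optionally, confirm the connection works by listing the tables it can see
# (dbListTables() comes from the DBI package that RMySQL builds on)
dbListTables(con)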

# So that Chinese text is not displayed as question marks

dbSendQuery(con, 'set names utf8')

# COLUMN is the column holding the Weibo text

rs = dbSendQuery(con, "select COLUMN from TABLE limit 100")

data = fetch(rs, n=-1)
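
# Housekeeping sketch: once the rows are fetched, the result set and the
# connection can be released (dbClearResult() and dbDisconnect() are standard DBI calls)
dbClearResult(rs)
dbDisconnect(con)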


# Clean

# Remove URLs

data = gsub(pattern="http[s]?:[a-zA-Z\\/\\.0-9]+", replacement="", x=data)
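
# A further cleaning sketch: Weibo text usually also contains @mentions and
# #topic# tags; the two patterns below are assumptions about that format,
# so adjust them to your own data before relying on them
data = gsub(pattern="@[^ :：,，]+", replacement="", x=data)
data = gsub(pattern="#[^#]*#", replacement="", x=data)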


# Segment

library("Rwordseg")

# Segment each Weibo post into words

corpus <- lapply(X=data, FUN=segmentCN)
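
# segmentCN() returns a list with one character vector of words per post.
# A common follow-up sketch is to paste each vector back into a single
# space-separated string, so every post maps to exactly one document in the corpus below
corpus <- sapply(corpus, FUN=paste, collapse=" ")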


# Convert into corpus

library(tm)

# Turn the character vector corpus into a tm corpus

doc.cor = Corpus(VectorSource(corpus))

# Weibo posts contain plenty of English words, so English is handled too; convert
# everything to lowercase before removing English stop words

doc.cor = tm_map(doc.cor, tolower)
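
# Note: with tm >= 0.6 this direct use of tolower may error; wrapping it as
# content_transformer(tolower) is the usual fix there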

# Remove digits

doc.cor = tm_map(doc.cor, removeNumbers)

# The cleaning steps above can leave extra whitespace behind; strip it here

doc.cor = tm_map(doc.cor, stripWhitespace)

doc.cor = tm_map(doc.cor, PlainTextDocument)

stopwordsCN = readLines("stopwordsCN.txt")

# The default minimum word length is 3; lower it to 1 here so single-character Chinese words are kept

control = list(stopwords = stopwordsCN, wordLengths = c(1, Inf))

# Terms as rows, documents as columns

doc.dtm <- TermDocumentMatrix(doc.cor, control)

# Documents as rows, terms as columns

# doc.dtm <- DocumentTermMatrix(doc.cor, control)

inspect(doc.dtm)
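
# A quick follow-up sketch: findFreqTerms() from tm lists the terms that occur
# at least a given number of times; the threshold of 5 below is an arbitrary choice
findFreqTerms(doc.dtm, lowfreq = 5)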