Golang实现词频统计

本例使用golang实现词频统计。步骤:

(1)从文件中读取1篇文章。

(2)统计词频,按单词出现的频率从大到小进行排序。

(3)写入到文件中。

注:任何非英文字母的符号均认为是单词分隔符(即等同于空格)。

效率:使用本程序统计一篇150万单词的文章,大约需要70ms。

1.核心代码:

package wordtest

import (
"bytes"
"fmt"
"io/ioutil"
"os"
"runtime"
"sort"
"strings"
"time"
)

//简单的词频统计任务
func CountTestBase(inputFilePath string, outputFilePath string) {
//时间开始点
start := time.Now().UnixNano() / 1e6

//读取文件
fileData, err := ioutil.ReadFile(inputFilePath)
CheckError(err, "read file")
var fileText string = string(fileData)

//根据CPU核数新开协程
newRountineCount := runtime.NumCPU()*2 – 1
runtime.GOMAXPROCS(newRountineCount + 1)
//切分文件
parts := splitFileText(fileText, newRountineCount)

var ch chan map[string]int = make(chan map[string]int, newRountineCount)
for i := 0; i < newRountineCount; i++ {
go countTest(parts[i], ch)
}

//主线程接收数据
var totalWordsMap map[string]int = make(map[string]int, 0)
completeCount := 0
for {
receiveData := <-ch
for k, v := range receiveData {
totalWordsMap[strings.ToLower(k)] += v
}
completeCount++

if newRountineCount == completeCount {
break
}
}

//添加进slice,并排序
list := make(WordCountBeanList, 0)
for k, v := range totalWordsMap {
list = append(list, NewWordCountBean(k, v))
}
sort.Sort(list)
//时间结束点
end := time.Now().UnixNano() / 1e6
fmt.Printf("time consume:%dms
", end-start)

//输出
wordsCount := list.totalCount()
var data bytes.Buffer
data.WriteString(fmt.Sprintf("程序履行:%dms
", end-start))
data.WriteString(fmt.Sprintf("文章总单词数:%d

", wordsCount))
for _, v := range list {
var percent float64 = 100.0 * float64(v.count) / float64(wordsCount)
_, err := data.WriteString(fmt.Sprintf("%s: %d, %3.2f%%
", v.word, v.count, percent))
CheckError(err, "bytes.Buffer, WriteString")
}

err = ioutil.WriteFile(outputFilePath, []byte(data.String()), os.ModePerm)
CheckError(err, "ioutil.WriteFile")
}

// countTest counts the words in text and sends the resulting word -> count
// map on ch exactly once.
//
// A word is a maximal run of ASCII letters (A-Z, a-z); every other character
// acts as a separator. Keys keep the letter case found in text.
func countTest(text string, ch chan map[string]int) {
	isLetter := func(r rune) bool {
		return ('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z')
	}

	counts := make(map[string]int)
	// Byte offset where the word being scanned began; -1 while between words.
	wordStart := -1
	for pos, r := range text {
		switch {
		case isLetter(r):
			if wordStart < 0 {
				wordStart = pos
			}
		case wordStart >= 0:
			// First separator after a word: record the word that just ended.
			counts[text[wordStart:pos]]++
			wordStart = -1
		}
	}

	// Record a word that runs up to the very end of the text.
	if wordStart >= 0 {
		counts[text[wordStart:]]++
	}
	ch <- counts
}

// splitFileText cuts fileText into n consecutive parts of roughly equal byte
// length and returns them in order. Concatenating the parts yields fileText.
//
// Each cut starts at a multiple of len(fileText)/n and is moved forward to
// the next space character, so a cut never lands inside a run of non-space
// bytes. If no space follows a cut point, the cut is placed at the end of
// the text and the remaining parts are empty. It returns nil when n <= 0.
func splitFileText(fileText string, n int) []string {
	if n <= 0 {
		return nil
	}

	length := len(fileText)
	parts := make([]string, n)
	chunk := length / n

	lastPosition := 0
	for i := 0; i < n-1; i++ {
		position := chunk * (i + 1)
		// Stop at the end of the text: without this bound, input with no
		// space after the cut point would index out of range.
		for position < length && fileText[position] != ' ' {
			position++
		}

		parts[i] = fileText[lastPosition:position]
		lastPosition = position
	}

	// The final part takes whatever is left.
	parts[n-1] = fileText[lastPosition:]
	return parts
}

// CheckError panics with the string msg + "," + err.Error() when err is
// non-nil, and does nothing when err is nil.
func CheckError(err error, msg string) {
	if err == nil {
		return
	}
	panic(msg + "," + err.Error())
}

2. 一个struct

package wordtest

// WordCountBean pairs a word with the number of times it was counted.
type WordCountBean struct {
	word  string
	count int
}

// NewWordCountBean returns a pointer to a new WordCountBean holding word and
// count.
func NewWordCountBean(word string, count int) *WordCountBean {
	bean := WordCountBean{word: word, count: count}
	return &bean
}

// WordCountBeanList is a slice of beans that provides Len, Less and Swap, so
// it can be passed to sort.Sort.
type WordCountBeanList []*WordCountBean

// Len returns the number of beans in the list.
func (list WordCountBeanList) Len() int { return len(list) }

// Less orders beans by descending count; beans with equal counts are ordered
// by ascending word.
func (list WordCountBeanList) Less(i, j int) bool {
	a, b := list[i], list[j]
	if a.count != b.count {
		return a.count > b.count
	}
	return a.word < b.word
}

// Swap exchanges the beans at positions i and j.
func (list WordCountBeanList) Swap(i, j int) {
	list[i], list[j] = list[j], list[i]
}

// totalCount returns the sum of the counts of every bean in the list.
func (list WordCountBeanList) totalCount() int {
	sum := 0
	for idx := range list {
		sum += list[idx].count
	}
	return sum
}

3.主函数:

package main

import (
"WordsTest/wordtest"
)

// main runs the word-frequency job, reading files/article.txt and writing the
// report to files/hanjun-result.txt.
func main() {
	const (
		inputFilePath  = "files/article.txt"
		outputFilePath = "files/hanjun-result.txt"
	)

	wordtest.CountTestBase(inputFilePath, outputFilePath)
}

波比源码 – 精品源码模版分享 | www.bobi11.com
1. 本站所有资源来源于用户上传和网络,如有侵权请邮件联系站长!
2. 分享目的仅供大家学习和交流,您必须在下载后24小时内删除!
3. 不得使用于非法商业用途,不得违反国家法律。否则后果自负!
4. 本站提供的源码、模板、插件等等其他资源,都不包含技术服务请大家谅解!
5. 如有链接无法下载、失效或广告,请联系管理员处理!
6. 本站资源售价只是赞助,收取费用仅维持本站的日常运营所需!
7. 如遇到加密压缩包,请使用WINRAR解压,如遇到无法解压的请联系管理员!

波比源码 » Golang实现词频统计

发表评论

Hi, 如果你对这款模板有疑问,可以跟我联系哦!

联系站长
赞助VIP 享更多特权,建议使用 QQ 登录
喜欢我嘛?喜欢就按“ctrl+D”收藏我吧!♡