1 package chapter07
2
3 object Test18_ComplexWordCount {
/**
 * Word count over pre-aggregated input.
 *
 * Each input tuple pairs a sentence with the number of times that sentence
 * occurred. The three most frequent words are computed in two different ways
 * and every intermediate result is printed to standard output.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  val tupleList: List[(String, Int)] = List(
    ("hello", 1),
    ("hello world", 2),
    ("hello scala", 3),
    ("hello spark from scala", 1),
    ("hello flink from scala", 2)
  )

  // Approach 1: expand each sentence `count` times so the problem becomes a plain word count.
  // The trailing space keeps the repeated copies separated from each other.
  val newStringList: List[String] = tupleList.map {
    case (sentence, count) => (sentence.trim + " ") * count
  }
  println(newStringList)

  // From here on the steps are identical to the plain word count.
  val wordCountList: List[(String, Int)] = newStringList
    .flatMap(_.split(" "))                                        // tokenize on spaces
    .groupBy(identity)                                            // group equal words together
    .map { case (word, occurrences) => (word, occurrences.size) } // count each group
    .toList
    .sortBy(_._2)(Ordering[Int].reverse)                          // highest count first
    .take(3)
  println(wordCountList)

  println("================================")

  // Approach 2: work directly on the pre-aggregated counts.
  // 1. Split every sentence into words and pair each word with the sentence's count,
  //    e.g. List((hello,1), (hello,2), (world,2), (hello,3), (scala,3), ...).
  val preCountList: List[(String, Int)] = tupleList.flatMap {
    case (sentence, count) => sentence.split(" ").map(word => (word, count))
  }
  println(preCountList)

  // 2. Group the (word, count) pairs by word.
  val preCountMap: Map[String, List[(String, Int)]] = preCountList.groupBy(_._1)
  println(preCountMap)

  // 3. Sum the pre-aggregated counts of each word.
  //    A strict `map` is used instead of `mapValues`: on Scala 2.12 `mapValues` yields a lazy
  //    view that recomputes the sum on every access, and on 2.13+ it is deprecated and returns
  //    a MapView, which no longer conforms to the declared Map[String, Int].
  val countMap: Map[String, Int] = preCountMap.map {
    case (word, pairs) => word -> pairs.map(_._2).sum
  }
  println(countMap)

  // 4. Convert to a list, sort by count in descending order and keep the top 3.
  val countList = countMap.toList
    .sortWith(_._2 > _._2)
    .take(3)
  println(countList)
}
60