Hive Demo

1. Initial environment

create table if not exists tmp_words (
    id int,
    word string
)
row format delimited
fields terminated by " "
lines terminated by "\n";
insert into tmp_words values(6,'nihao'), (7,'nihao'), (8,'nihao'), (9,'nihao');
> build.sbt
name := "MTSpark"
version := "1.0"
scalaVersion := "2.11.7"
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "1.6.1",
  "org.apache.spark" %% "spark-streaming" % "1.6.1",
  "org.apache.spark" % "spark-sql_2.11" % "1.6.1",
  "org.apache.spark" % "spark-hive_2.11" % "1.6.1"
)

2. Configure the conf objects

1) Configure the SparkConf object

import org.apache.spark.{SparkConf, SparkContext}
val sparkConf = new SparkConf().setAppName("HiveFromSpark")
val sc = new SparkContext(sparkConf)
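
If the demo is run directly from an IDE rather than through spark-submit, the master URL must also be set on the conf. A minimal sketch, where "local[2]" (driver-local execution with two threads) is an illustrative choice:

// Only needed for local runs; with spark-submit, pass --master on the command line instead
val localConf = new SparkConf().setAppName("HiveFromSpark").setMaster("local[2]")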

2) Configure the HiveContext object

import org.apache.spark.sql.hive.HiveContext
//A hive context adds support for finding tables in the MetaStore and writing queries using HiveQL.
val hiveContext = new HiveContext(sc)
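
As an optional sanity check, you can list the tables visible to the HiveContext; assuming the metastore set up in step 1 is reachable, the output should include tmp_words:

// Each result row contains the table name and whether it is a temporary table
hiveContext.sql("show tables").collect().foreach(println)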

3. Execute SQL operations

import hiveContext.sql
sql("create table if not exists tmp_words (id int, word string)")
// kv1File is assumed to be a java.io.File pointing to a local data file
// (e.g. Spark's kv1.txt sample); it is not defined in this demo
sql(s"load data local inpath '${kv1File.getAbsolutePath}' into table tmp_words")
val count = sql("select count(*) from tmp_words").collect().head.getLong(0)
// the s prefix enables string interpolation, substituting $count into the string
println(s"Total: $count rows")

import hiveContext.implicits._
// Register an RDD of case-class records as a temporary table
case class Record(id: Int, word: String)
val rdd = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i")))
rdd.toDF().registerTempTable("records")
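To confirm the registration worked, the temporary table can be queried like any other table; note that it exists only within this HiveContext, not in the Hive metastore:

// Returns the first five generated records, e.g. [1,val_1]
sql("select * from records limit 5").collect().foreach(println)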

println("Result of SELECT *: 表的join操作")
sql("select * from records r join tmp_words s on r.id = s.id").collect().foreach(println)

import org.apache.spark.sql._
println("Result of RDD.map:")
// Iterate over the SQL query result as an RDD of Rows,
// pattern-matching each Row into a formatted string
val rddFromSql = sql("select id, word from tmp_words")
val rddAsStrings = rddFromSql.map {
    case Row(id: Int, word: String) => s"Key: $id, Value: $word"
}
rddAsStrings.collect().foreach(println)
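
Finally, once the demo has finished, it is good practice to release cluster resources by stopping the SparkContext:

// Shuts down the context; no further jobs can be submitted afterwards
sc.stop()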
