本节介绍云数据库ClickHouse的经典场景示例,示例数据集由一个表组成,这个表包含有关Yandex.Metrica的匹配(hits_v1)的匿名数据。数据集可以作为压缩的tsv.xz文件下载或作为准备好的分区下载。 下载TSV文件,并且将数据导入数据库。 如果直接下载tsv数据较慢,可以先使用其他工具或源下载数据文件,再使用xz -d hits_v1.tsv.xz 命令解压 ,然后再导入。
单节点示例
下载数据包并解压
curl https://clickhouse-datasets.s3.yandex.net/hits/tsv/hits_v1.tsv.xz | unxz --threads= `nproc` > hits_v1.tsv
创建数据库
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets"
创建表
clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192";
导入数据
cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000
统计数据量
clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1"
两节点单副本示例
命令行客户端连接集群任意节点,连接命令如下:
clickhouse-client -h A.A.A.A --port 8123 --user username --password yourPassword
创建数据库和表
CREATE DATABASE IF NOT EXISTS datasets ON CLUSTER 'inst_f5040618_2shards_1replicas'
创建本地表 hits_v1 建表语句添加ON CLUSTER 关键字后,在任意节点上执行建表语句都会为集群所有节点创建该表。
CREATE TABLE datasets.hits_v1 ON CLUSTER 'inst_f5040618_2shards_1replicas' ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/datasets/hits_v1','{replica}') PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192;
创建分布式表
CREATE TABLE IF NOT EXISTS datasets.hits_all ON CLUSTER 'inst_f5040618_2shards_1replicas'AS datasets.hits_v1 ENGINE = Distributed( 'inst_f5040618_2shards_1replicas',datasets,hits_v1,rand());
导入数据
cat hits_v1.tsv | clickhouse-client -h A.A.A.A --port 8123 --user username --password yourPassword --query "INSERT INTO datasets.hits_all FORMAT TSV" --max_insert_block_size=100000;
统计数据量
localhost :) select count(*) from datasets.hits_all;
SELECT count(*)
FROM datasets.hits_all
┌─count()─┐
│ 8873898 │
└─────────┘
1 rows in set. Elapsed: 0.006 sec.
localhost :) select count(*) from datasets.hits_v1;
SELECT count(*)
FROM datasets.hits_v1
┌─count()─┐
│ 4435304 │
└─────────┘
1 rows in set. Elapsed: 0.002 sec.
条件查询示例
localhost :) SELECT count() FROM datasets.hits_all WHERE CounterID < 10000;
SELECT count()
FROM datasets.hits_all
WHERE CounterID < 10000
┌─count()─┐
│ 32670 │
└─────────┘
1 rows in set. Elapsed: 0.147 sec. Processed 73.73 thousand rows, 294.91 KB (499.99 thousand rows/s., 2.00 MB/s.)
localhost :) SELECT count() FROM datasets.hits_all WHERE CounterID > 10000;
SELECT count()
FROM datasets.hits_all
WHERE CounterID > 10000
┌─count()─┐
│ 8841228 │
└─────────┘
1 rows in set. Elapsed: 0.083 sec. Processed 8.87 million rows, 35.46 MB (106.47 million rows/s., 425.89 MB/s.)
从上面的操作结果可以看出云数据库ClickHouse在处理百万至千万数量级的条件查询操作处理速度非常快,能够满足实时查询的需求。