DROP TABLE IF EXISTS access_logs; CREATE EXTERNAL TABLE access_logs ( remote_addr STRING comment 'client IP', time_local STRING comment 'access time', request STRING comment 'request URL', status STRING comment 'HTTP status', body_bytes_sent STRING comment 'size of response body', http_referer STRING comment 'referer', http_cookie STRING comment 'cookies', remote_user STRING comment 'client name', http_user_agent STRING comment 'client browser info', request_time STRING comment 'consumed time of handling request', host STRING comment 'server host', msec STRING comment 'consumed time of writing logs' ) COMMENT 'web access logs' ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' WITH SERDEPROPERTIES ( "input.regex" = "([0-9\\.]+) - \\[([^\\]]+)\\] \"([^\"]*)\" ([\\d]+) ([\\d]*) \"([^\"]*)\" \"([^\"]*)\" ([\\S]+) \"([^\"]*)\" ([0-9\\.]+) ([\\S]+) ([0-9\\.]+)" ) STORED AS TEXTFILE LOCATION "${INPUT}"; INSERT OVERWRITE DIRECTORY '${OUTPUT}' SELECT hour(from_unixtime(unix_timestamp(time_local, 'dd/MMMM/yyyy:HH:mm:ss Z'))) as hour, count(1) as pv FROM access_logs GROUP BY hour(from_unixtime(unix_timestamp(time_local, 'dd/MMMM/yyyy:HH:mm:ss Z'))) ORDER BY pv DESC limit 5;