Hive Word count

-- Source: https://github.com/slimandslam/pig-hive-wordcount/blob/master/wordcount.hql

DROP TABLE IF EXISTS myinput;
DROP TABLE IF EXISTS wordcount;
CREATE TABLE myinput (line STRING);

-- Load the text from the local (Linux) filesystem. This should be changed to HDFS
-- for any serious usage.
LOAD DATA LOCAL INPATH '/home/username/mytext.txt' INTO TABLE myinput;
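-- A possible HDFS variant for cluster use (commented out; the path below is hypothetical):
-- LOAD DATA INPATH '/user/username/mytext.txt' INTO TABLE myinput;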

   

-- Create a table with the words cleaned and counted.
-- The Java regex removes all punctuation and control characters; the backslashes
-- are doubled because Hive string literals treat \ as an escape character.
-- Reference: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html
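-- A quick illustration of the cleanup expression (standalone query, not part of
-- the original script; the sample string is made up):
SELECT LOWER(REGEXP_REPLACE('Hello, World!', '[\\p{Punct},\\p{Cntrl}]', '')) AS cleaned;
-- returns: hello world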

CREATE TABLE wordcount AS
SELECT word, COUNT(1) AS count
FROM (
  SELECT EXPLODE(SPLIT(LOWER(REGEXP_REPLACE(line, '[\\p{Punct},\\p{Cntrl}]', '')), ' ')) AS word
  FROM myinput
) words
GROUP BY word
-- Sort the output by count with the highest counts first
ORDER BY count DESC, word ASC;
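-- Optional sanity check (not part of the original script): peek at the ten most frequent words.
SELECT word, count FROM wordcount ORDER BY count DESC LIMIT 10;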

   

-- Make the output look like the output of the Pig DUMP operator
-- so that we can diff this output with the Pig wordcount output.
SELECT CONCAT_WS(',', CONCAT('(', word), CONCAT(count, ')')) FROM wordcount;
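-- One way to also capture that formatted output as local files for the diff against
-- the Pig run (a sketch; the output directory below is hypothetical):
INSERT OVERWRITE LOCAL DIRECTORY '/home/username/hive_wordcount_out'
SELECT CONCAT_WS(',', CONCAT('(', word), CONCAT(count, ')')) FROM wordcount;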

   

-- EXPLODE is a UDTF (user-defined table-generating function): it turns each element
-- of an array into a separate row.
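-- A minimal illustration (standalone query; the sample sentence is made up):
SELECT EXPLODE(SPLIT('hive counts words', ' ')) AS word;
-- returns three rows: hive, counts, words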

   

Original post: https://www.cnblogs.com/huaxiaoyao/p/4347040.html