Hive Word count

-- Source: https://github.com/slimandslam/pig-hive-wordcount/blob/master/wordcount.hql

DROP TABLE IF EXISTS myinput;
DROP TABLE IF EXISTS wordcount;
CREATE TABLE myinput (line STRING);

-- Load the text from the local (Linux) filesystem. This should be changed to HDFS
-- for any serious usage.
LOAD DATA LOCAL INPATH '/home/username/mytext.txt' INTO TABLE myinput;
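-- A possible HDFS variant for cluster use (commented out; the path below is hypothetical):
-- LOAD DATA INPATH '/user/username/mytext.txt' INTO TABLE myinput;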

   

-- Create a table with the words cleaned and counted.
-- The Java regex removes all punctuation and control characters; the backslashes
-- are doubled because Hive string literals treat \ as an escape character.
-- Reference: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html
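-- A quick illustration of the cleanup expression (standalone query, not part of
-- the original script; the sample string is made up):
SELECT LOWER(REGEXP_REPLACE('Hello, World!', '[\\p{Punct},\\p{Cntrl}]', '')) AS cleaned;
-- returns: hello world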

CREATE TABLE wordcount AS
SELECT word, COUNT(1) AS count
FROM (
  SELECT EXPLODE(SPLIT(LOWER(REGEXP_REPLACE(line, '[\\p{Punct},\\p{Cntrl}]', '')), ' ')) AS word
  FROM myinput
) words
GROUP BY word
-- Sort the output by count with the highest counts first
ORDER BY count DESC, word ASC;
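-- Optional sanity check (not part of the original script): peek at the ten most frequent words.
SELECT word, count FROM wordcount ORDER BY count DESC LIMIT 10;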

   

-- Make the output look like the output of the Pig DUMP operator
-- so that we can diff this output with the Pig wordcount output.
SELECT CONCAT_WS(',', CONCAT('(', word), CONCAT(count, ')')) FROM wordcount;
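-- One way to also capture that formatted output as local files for the diff against
-- the Pig run (a sketch; the output directory below is hypothetical):
INSERT OVERWRITE LOCAL DIRECTORY '/home/username/hive_wordcount_out'
SELECT CONCAT_WS(',', CONCAT('(', word), CONCAT(count, ')')) FROM wordcount;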

   

-- EXPLODE is a UDTF (user-defined table-generating function): it turns each element
-- of an array into a separate row.
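-- A minimal illustration (standalone query; the sample sentence is made up):
SELECT EXPLODE(SPLIT('hive counts words', ' ')) AS word;
-- returns three rows: hive, counts, words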

   

Original post: https://www.cnblogs.com/huaxiaoyao/p/4347040.html