postgres多知识点综合案例

1、使用regexp_split_to_table(text_industries, '#;#')将字符串切分为行

======================================

使用到的知识点：

1、使用with临时存储sql语句，格式【with as xxx(), as xxx2() 】以减少代码；

2、使用round()取小数点后几位；

3、使用to_char()将时间格式的数据转换为text型；

4、使用split_part(xx,xx2,xx3)函数对文本型数据进行切分；

5、使用group by之后利用count()进行统计；

6、join 以及 left join之间的区别；

7、使用join连接多个表，基本格式：【a join b on a.id = b.id join c on a.id = c.id】；

8、嵌套查询（select * from (select * from )）;

9、case xx when a then b else c end xx2：判断xx，如果满足a，赋值为b，否则赋值为c，最后取别名xx2；

10、使用current_date获取年月日：2021-01-28，使用now()获取当前时间戳，使用select to_char(now(),'YYYY')获取年；

11、使用【||】进行字符串的拼接；

12、使用to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' )将CURRENT_DATE 拼接时间后转时间戳；

13、使用【时间戳 + '-1 day'】进行时间戳的天数减一；

14、使用：【字段::类型】可以将字段转换为指定类型，或者使用【cast(字段 as 类型)】；

15、使用【insert into 表名(字段名1，字段名2) select * from 表名2 】将查询出来的值批量添加到另一个表中；

with tmp as (
select * from (
select 
d1.user_id,
d1.company_name,
d1.website_name,
d1.source_top,
round( 100 * d1.source_top / d2.news_num, 2 ) AS ratio,
row_number( ) OVER ( PARTITION BY d1.user_id, d1.company_name ORDER BY d1.source_top DESC) AS row_num
from
(SELECT
    t1.user_id,
    split_part ( t2.monitor_words_company, '#;#', 1 ) AS company_name,
    website_name AS website_name,
    count( website_name ) AS source_top 
FROM
    service.eoias_sentiment_analysis_result t1
    JOIN service.eoias_crawler_key_param t2 ON t1.case_id = cast( t2.id AS text ) 
WHERE
    t1.release_time >= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' ) + '-1 day' 
    AND t1.release_time <= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' ) 
GROUP BY
    t1.user_id,
    company_name,
    website_name) d1
join
(SELECT
    user_id,
    company_name,
    count( company_name ) AS news_num 
FROM
    (
SELECT
    t1.user_id AS user_id,
    t1.case_id AS case_id,
    split_part ( t2.monitor_words_company, '#;#', 1 ) AS company_name,
website_name AS website_name,
CURRENT_DATE AS daily_date 
FROM
    service.eoias_sentiment_analysis_result t1
    JOIN service.eoias_crawler_key_param t2 ON t1.case_id = cast( t2.id AS text ) 
WHERE
    t1.release_time >= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' ) + '-1 day' 
    AND t1.release_time <= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' ) and t1.user_id = t2.user_id and t1.case_id = cast( t2.id AS text )
    ) c1 
GROUP BY
    c1.user_id,
    company_name) d2
on d1.user_id = d2.user_id and d1.company_name = d2.company_name) e1 where row_num <=2
),

tmp2 as (
SELECT
    user_id,
    company_name,
    count( company_name ) AS news_num 
FROM
    (
SELECT
    t1.user_id AS user_id,
    t1.case_id AS case_id,
    split_part ( t2.monitor_words_company, '#;#', 1 ) AS company_name,
website_name AS website_name,
CURRENT_DATE AS daily_date 
FROM
    service.eoias_sentiment_analysis_result t1
    JOIN service.eoias_crawler_key_param t2 ON t1.case_id = cast( t2.id AS text ) 
WHERE
    t1.release_time >= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' ) + '-1 day' 
    AND t1.release_time <= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' ) and t1.user_id = t2.user_id and t1.case_id = cast( t2.id AS text )
    ) c1 
GROUP BY
    c1.user_id,
    company_name
),

tmp3 as (
select user_id,company_name,sentiment_top1,sentiment_top1_num,sentiment_top1_ratio from (
SELECT
    c1.user_id,
    c1.company_name,
    c1.text_sentiment as sentiment_top1,
    c1.sentiment_top as sentiment_top1_num,
    round(100 * c1.sentiment_top / c2.news_num, 2) as sentiment_top1_ratio,
    row_number() over (partition by c1.user_id, c1.company_name) as rown 
FROM
    (
SELECT
    t1.user_id,
    split_part ( t2.monitor_words_company, '#;#', 1 ) AS company_name,
t1.text_sentiment,
count( 1 ) AS sentiment_top 
FROM
    service.eoias_sentiment_analysis_result t1
    JOIN service.eoias_crawler_key_param t2 ON t1.case_id = cast( t2.id AS text ) 
WHERE
    t1.release_time >= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' ) + '-1 day' 
    AND t1.release_time <= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' ) 
GROUP BY
    t1.user_id,
    company_name,
    text_sentiment 
    ) c1
    JOIN (
SELECT
    user_id,
    company_name,
    count( company_name ) AS news_num 
FROM
    (
SELECT
    t1.user_id AS user_id,
    t1.case_id AS case_id,
    split_part ( t2.monitor_words_company, '#;#', 1 ) AS company_name,
website_name AS website_name,
CURRENT_DATE AS daily_date 
FROM
    service.eoias_sentiment_analysis_result t1
    JOIN service.eoias_crawler_key_param t2 ON t1.case_id = cast( t2.id AS text ) 
WHERE
    t1.release_time >= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' ) + '-1 day' 
    AND t1.release_time <= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' ) 
    AND t1.user_id = t2.user_id 
    AND t1.case_id = cast( t2.id AS text ) 
    ) c1 
GROUP BY
    c1.user_id,
    company_name 
    ) c2 ON c1.user_id = c2.user_id 
    AND c1.company_name = c2.company_name) d1 where rown = '1'
)

insert into daily.eoias_daily_abstract(user_id,case_id,daily_date,company_name,news_num,source_top1,source_top1_num,source_top1_ratio,source_top2,source_top2_num,source_top2_ratio,sentiment_top1,sentiment_top1_num,sentiment_top1_ratio)
select 
    c.user_id,
    c.case_id,
    to_char(now()::timestamp,'YYYYmmdd') as daily_date,
    c.company_name,
    tmp2.news_num,
    tmp1.source_top1,
    tmp1.source_top1_num,
    tmp1.source_top1_ratio,
    tmp1.source_top2,
    tmp1.source_top2_num,
    tmp1.source_top2_ratio,
    tmp3.sentiment_top1,
    tmp3.sentiment_top1_num,
    tmp3.sentiment_top1_ratio
from (
SELECT
    a.user_id,
    a.case_id,
    split_part ( b.monitor_words_company, '#;#', 1 ) AS company_name 
FROM
    service.eoias_sentiment_analysis_result a
    JOIN service.eoias_crawler_key_param b ON a.case_id = cast( b.id AS text ) 
WHERE
    a.release_time >= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss' ) + '-1 day' 
    AND a.release_time <= to_timestamp ( CURRENT_DATE || ' ' || '07:00:00', 'yyyy-MM-dd hh24:mi:ss')) c 
    join (select 
    a.user_id,
    a.company_name, 
    a.website_name as source_top1, 
    a.source_top as source_top1_num,
    a.ratio as source_top1_ratio,
    case when b.website_name is null then '' else b.website_name end source_top2,
    case when b.source_top is null then 0 else b.source_top end source_top2_num,
    case when b.ratio is null then 0 else b.ratio end source_top2_ratio 
from
(select user_id, company_name, website_name, ratio, source_top from tmp where row_num = 1) a
left join
(select user_id, company_name, website_name, ratio, source_top from tmp where row_num = 2) b
on a.company_name = b.company_name and a.user_id = b.user_id) tmp1 on c.user_id = tmp1.user_id and  c.company_name = tmp1.company_name 
join tmp2 on c.user_id = tmp2.user_id and c.company_name = tmp2.company_name 
join tmp3 on c.user_id = tmp3.user_id and c.company_name = tmp3.company_name;

补充：

判断公司名称中是否包含相关字段来进行统一命名：

SELECT
CASE
    
WHEN
    cast( position( '腾讯' IN company_name ) AS boolean ) THEN
    '腾讯' 
        WHEN cast( position( '阿里' IN company_name ) AS boolean ) THEN
        '阿里巴巴' 
        WHEN cast( position( '中新赛克' IN company_name ) AS boolean ) THEN
        '中新赛克' ELSE company_name 
        END company_name 
FROM
daily.eoias_daily_website

从时间戳中提取月、日、时等

extract(Month from now()) || '月' || extract(Day from now()) || '日' || extract(Hour from now()) || '时'

提取一段时间内的每小时：

select generate_series ( '2021-02-24 07:00:00' :: TIMESTAMP, '2021-02-25 07:00:00' :: TIMESTAMP, '1 hour' ) AS "hour"