Linux awk学习

零、awk标准语法

[root@wohaoshuai1 bbb]# echo "abcd" |awk 'BEGIN{print "wohaoshuai"} /a/ {print length($0)} END{print "hehe"}'
wohaoshuai
4
hehe
[root@wohaoshuai1 bbb]# echo "abcd" |awk 'BEGIN{print "wohaoshuai"} /bb/ {print length($0)} END{print "hehe"}'
wohaoshuai
hehe
[root@wohaoshuai1 bbb]# echo "abcd" |awk 'BEGIN{print "wohaoshuai"} /b/ {print length($0)} END{print "hehe"}'
wohaoshuai
4
hehe

一、awk基本操作

　　1、打印文件中信息

[root@wohaoshuai1 bbb]# awk '{print}' 1.txt 
abaca        dafdsaf         23    dsfsfsdf
dsafbe        dfsdfds         34    dadsgr4edxzgvcxiksjkl
fsbsfads      ejfklsdnmfkldsj        45    ds;lfekopmkldnvksda
sfdksnme3    dsfsknfkl353426        56    4t5t435
sf3ew53wefsty    dsf3e5t345        67    3e435fdsf6

　　2、打印文件中第一列信息

[root@wohaoshuai1 bbb]# awk '{print $1}' 1.txt 
abaca
dsafbe
fsbsfads
sfdksnme3
sf3ew53wefsty

　　3、打印第一列第三列，中间使用制表符隔开

[root@wohaoshuai1 bbb]# awk '{print $1 "	" $3}' 1.txt 
abaca    23
dsafbe    34
fsbsfads    45
sfdksnme3    56
sf3ew53wefsty    67

　　4、添加行号(NR)

[root@wohaoshuai1 bbb]# awk '{print NR "	" $1 "	" $2 "	" $3}' 1.txt 
1    abaca    dafdsaf    23
2    dsafbe    dfsdfds    34
3    fsbsfads    ejfklsdnmfkldsj    45
4    sfdksnme3    dsfsknfkl353426    56
5    sf3ew53wefsty    dsf3e5t345    67

　　5、打印每一行有多少列(NF)

[root@wohaoshuai1 bbb]# awk '{print NF "	" $0}' 1.txt 
4    abaca        dafdsaf         23    dsfsfsdf
4    dsafbe        dfsdfds         34    dadsgr4edxzgvcxiksjkl
4    fsbsfads      ejfklsdnmfkldsj        45    ds;lfekopmkldnvksda
6    sfdksnme3    dsfsknfkl353426        56    4t5t435 esf aaa
5    sf3ew53wefsty    dsf3e5t345        67    3e435fdsf6 eee

　　6、打印出第三列等于67的行

[root@wohaoshuai1 bbb]# awk '$3==67 {print $0}' 1.txt 
dsafbe        dfsdfds         67    dadsgr4edxzgvcxiksjkl
sf3ew53wefsty    dsf3e5t345        67    3e435fdsf6 eee
sf3ew53wefsty   dsf3e5t345              67      3e435fdsf6 ees

　　7、打印出第一列等于abaca的行

[root@wohaoshuai1 bbb]# awk '$1=="abaca" {print $0}' 1.txt 
abaca        dafdsaf         23    dsfsfsdf

　　8、如何把一列竖排（多行）的数据转换成横排（合并为一行）？

[root@wohaoshuai1 bbb]# cat 3.txt
abc"
"xxabc
sfasabc
"xxabcxx"    abc
"a bc"        bcd
abcsdfdsfeabc    dsfdabc
abcdefg
ab c
a b c
a c
adc
bbb
a.c
[root@wohaoshuai1 bbb]# awk '{printf("%s,",$0)}' 3.txt
abc","xxabc,sfasabc,"xxabcxx"    abc,"a bc"        bcd,abcsdfdsfeabc    dsfdabc,abcdefg,ab c,a b c,a c,adc,bbb,a.c,[root@wohaoshuai1 bbb]#

　　9、大哥操作（综合示例：先用grep筛选出包含"Dec/2018"的日志行，再用awk数组统计第一列各个值出现的次数，最后按次数从大到小排序取前10）

grep "Dec/2018" pyplatform_access.log|awk '{a[$1]++}END{for(i in a){print a[i]" "i}}'|sort -nr |head -10

二、awk内部变量

　　1、NF 当前行的字段数量（即列数）

当此行有4列时就打印此行
[root@wohaoshuai1 bbb]# awk 'NF==4 {print NF "	" $0}' 1.txt 
4    abaca        dafdsaf         23    dsfsfsdf
4    dsafbe        dfsdfds         67    dadsgr4edxzgvcxiksjkl
4    fsbsfads      ejfklsdnmfkldsj        45    ds;lfekopmkldnvksda

　　2、NR 当前记录号（即行号，从1开始；在END块中即为已读取的总行数）

当NR为4，即第四行才打印
[root@wohaoshuai1 bbb]# awk 'NR==4 {print NR "	" $0}' 1.txt 
4    sfdksnme3    dsfsknfkl353426        56    4t5t435 esf aaa

　　3、FS 定义当前输入所用的分隔符

[root@wohaoshuai1 bbb]# awk 'BEGIN{FS=","} {print $1 "	" $2}'
1 2 4
1 2 4    
1,2,3,4
1    2
12,23,34
12    23

　　4、OFS 输出分隔符

[root@wohaoshuai1 bbb]# awk 'BEGIN{OFS=","} {print $1,$2}'   #相当于$1和$2之间的逗号由开始默认的空格变为了, 
1 2 3 4 5
1,2
1 2 3 45 45
1,2

将输入分隔符定义为,输出分隔符定义为,
[root@wohaoshuai1 bbb]# awk 'BEGIN{OFS=",";FS=","} {print $1,$2}'
1 2 3 4
1 2 3 4,
1,2,3,4
1,2

　　5、FILENAME 文件名

同时统计两个文件
[root@wohaoshuai1 bbb]# awk '{print NR "	" $0}' 1.txt 2.txt 
1    abaca        dafdsaf         23    dsfsfsdf
2    dsafbe        dfsdfds         67    dadsgr4edxzgvcxiksjkl
3    fsbsfads      ejfklsdnmfkldsj        45    ds;lfekopmkldnvksda
4    sfdksnme3    dsfsknfkl353426        56    4t5t435 esf aaa
5    sf3ew53wefsty    dsf3e5t345        67    3e435fdsf6 eee
6    sf3ew53wefsty   dsf3e5t345              67      3e435fdsf6 ees
7    sffbbb hello
8    aekjfdk slkfjdsklfnk dlskfjej
9    123 345 dskfj345 123556
10    123df dgfg fedkgfj45 skdllfe980
11    

输出行号，文件名和相应信息
[root@wohaoshuai1 bbb]# awk '{print NR "	" FILENAME "	" $0}' 1.txt 2.txt 
1    1.txt    abaca        dafdsaf         23    dsfsfsdf
2    1.txt    dsafbe        dfsdfds         67    dadsgr4edxzgvcxiksjkl
3    1.txt    fsbsfads      ejfklsdnmfkldsj        45    ds;lfekopmkldnvksda
4    1.txt    sfdksnme3    dsfsknfkl353426        56    4t5t435 esf aaa
5    1.txt    sf3ew53wefsty    dsf3e5t345        67    3e435fdsf6 eee
6    1.txt    sf3ew53wefsty   dsf3e5t345              67      3e435fdsf6 ees
7    2.txt    sffbbb hello
8    2.txt    aekjfdk slkfjdsklfnk dlskfjej
9    2.txt    123 345 dskfj345 123556
10    2.txt    123df dgfg fedkgfj45 skdllfe980
11    2.txt

　　6、给列赋值

方法1、
[root@wohaoshuai1 bbb]# awk '{$3="xxx";print $0}' 1.txt 
abaca dafdsaf xxx dsfsfsdf
dsafbe dfsdfds xxx dadsgr4edxzgvcxiksjkl
fsbsfads ejfklsdnmfkldsj xxx ds;lfekopmkldnvksda
sfdksnme3 dsfsknfkl353426 xxx 4t5t435 esf aaa
sf3ew53wefsty dsf3e5t345 xxx 3e435fdsf6 eee
sf3ew53wefsty dsf3e5t345 xxx 3e435fdsf6 ees
方法2、（把赋值表达式当作模式使用：赋值表达式的值是"xxx"，非空字符串为真，所以每一行都会被打印；如果赋的值是0或空字符串，模式为假，该行就不会被打印）
[root@wohaoshuai1 bbb]# awk '$3="xxx" {print $0}' 1.txt 
abaca dafdsaf xxx dsfsfsdf
dsafbe dfsdfds xxx dadsgr4edxzgvcxiksjkl
fsbsfads ejfklsdnmfkldsj xxx ds;lfekopmkldnvksda
sfdksnme3 dsfsknfkl353426 xxx 4t5t435 esf aaa
sf3ew53wefsty dsf3e5t345 xxx 3e435fdsf6 eee
sf3ew53wefsty dsf3e5t345 xxx 3e435fdsf6 ees

　　7、打印倒数第一列和倒数第二列

[root@wohaoshuai1 bbb]# awk '{print $NF}' 1.txt 
dsfsfsdf
dadsgr4edxzgvcxiksjkl
ds;lfekopmkldnvksda
aaa
eee
ees
[root@wohaoshuai1 bbb]# awk '{print $(NF-1)}' 1.txt 
23
67
45
esf
3e435fdsf6
3e435fdsf6

　　8、获取外部变量

　　　　a、获得普通外部变量

[root@361way ~]# test='awk test code'
[root@361way ~]# echo | awk  '{print test}' test="$test"
awk test code
注：awk '{action}' 变量名=变量值，这样传入变量，可以在action中获得值。 注意：变量名与值放到'{action}'后面；用这种方式传入的变量在BEGIN块执行时还没有被赋值，因此在BEGIN块中取不到。

　　　　b、BEGIN程序块中变量

[root@361way ~]# test='awk Begin test'
[root@361way ~]# echo | awk -v test="$test" 'BEGIN{print test}'
awk Begin test
[root@361way ~]# echo | awk -v test="$test" '{print test}'
awk Begin test

[root@wohaoshuai1 bbb]# echo |awk -v test="$test" -v test2="test2" 'BEGIN {print test,test2}'
awk test code test2

使用：awk -v 变量名=变量值 [-v 变量2=值2 …] 'BEGIN{action}'格式时，用-v 传入的变量在BEGIN、END或省略模式的3种类型的action 中都可以获得到变量值，但-v选项必须写在程序（action）前面。注意：程序中只有BEGIN块时awk不读取输入，最前面的echo管道可以省略；如果含有主体action或END块，awk需要读取输入，此时最前面的echo管道（或输入文件）不能少。

三、awk运算

　　1、在awk中会自动根据实际情况判断定义的变量是字符串还是数字

[root@wohaoshuai1 bbb]# awk '{a=1;b=2; print a + b}' #也支持加减乘除
3

[root@wohaoshuai1 bbb]# awk '{a=1;b=2; print a b}' #此处表示将ab两个字符串拼接起来
12

[root@wohaoshuai1 bbb]# awk '{a=1;b=2; print a,b}' #此处表示将ab两个字符串输出出来
1 2

四、正则表达式

　　1、/abc/ 如果字符串中包含abc三个字符那么表示符合正则表达式内容

[root@wohaoshuai1 bbb]# cat 3.txt 
"abc"
"xxabc"
"xxabcxx"
"a bc"
ab c
a b c
bbb
[root@wohaoshuai1 bbb]# awk '/abc/ {print $0}' 3.txt 
"abc"
"xxabc"
"xxabcxx"

　　2、/a.c/ #中间的点表示任何一个字符

[root@wohaoshuai1 bbb]# cat 3.txt 
"abc"
"xxabc"
"xxabcxx"
"a bc"
ab c
a b c
a c
adc
bbb
[root@wohaoshuai1 bbb]# awk '/a.c/ {print $0}' 3.txt 
"abc"
"xxabc"
"xxabcxx"
a c
adc

　　3、转义

[root@wohaoshuai1 bbb]# cat 3.txt 
"abc"
"xxabc"
"xxabcxx"
"a bc"
ab c
a b c
a c
adc
bbb
a.c
[root@wohaoshuai1 bbb]# awk '/a\.c/ {print $0}' 3.txt 
a.c

　　4、^ 和 $

匹配以abc开头的字符
/^abc/
[root@wohaoshuai1 bbb]# cat 3.txt 
abc"
"xxabc"
"xxabcxx"
"a bc"
abcdefg
ab c
a b c
a c
adc
bbb
a.c
[root@wohaoshuai1 bbb]# awk '/^abc/ {print $0}' 3.txt 
abc"
abcdefg

匹配以abc结尾的字符
/abc$/
[root@wohaoshuai1 bbb]# cat 3.txt 
abc"
"xxabc
sfasabc
"xxabcxx"
"a bc"
sdfdsfeabc
abcdefg
ab c
a b c
a c
adc
bbb
a.c
[root@wohaoshuai1 bbb]# awk '/abc$/ {print $0}' 3.txt 
"xxabc
sfasabc
sdfdsfeabc

　　5、[] #匹配方括号中的任意一个

/a[xyz]c/ #可以匹配到axc,ayc,azc

中间出现一个小写或大写的字符
/a[a-zA-Z]c/

^第二个意思
/a[^a-z]c/ ^写在中括号里面的最前面表示取反（不匹配），此处表示a和c中间匹配一个不是a-z的字符

　　6、*表示匹配零到多次，+ 表示匹配一次到多次

    /a*b/ #表示匹配0到多个a
    b
    ab
    aab
    /a+b/ #表示匹配一到多个a
    ab
    aab
    aaab

　　7、? #跟在一个字母后面表示这个字母可以出现也可以不出现

    /a?b/  #表示a可以有也可以没有,因此可以匹配 b 或者 ab

　　8、{}

    {3} #表示前面的字母必须出现三次   #/ab{3}c/
    {3,5} #表示前面的字母可以出现3-5次    #/ab{3,5}c/
    {3,}    #表示前面的字母可以出现3到多个    #/ab{3,}c/

　　9、()

    /(ab)+c/ #表示ab可以反复出现，如abc,ababc,abababc

五、awk高级用法

　　1、awk赋值运算，赋值语句运算符：= += -= *= /= %= ^= **=

例如：a+=5;等价于a=a+5

# awk 'BEGIN{a=5;a+=5;print a}'
10

　　2、awk正则运算

输出包含有root的行，并打印用户名和UID及原行内容

[root@localhost ~]# awk -F: '/root/ {print $1,$3,$0}' /etc/passwd
root 0 root:x:0:0:root:/root:/bin/bash
operator 11 operator:x:11:0:operator:/root:/sbin/nologin

我们发现找到了两行，如果我们想找root开头的行就要这样写：

awk -F: '/^root/' /etc/passwd

　　3、awk三目运算

# awk 'BEGIN{a="b";print a=="b"?"ok":"err"}'
ok
# awk 'BEGIN{a="b";print a=="c"?"ok":"err"}'
err
三目运算其实就是一个判断运算，如果为真则输出？后的内容，如果为假则输出：后的内容

　　4、awk的流程控制（条件判断与循环）运用

　　　　a、if语句运用

[root@localhost ~]# awk 'BEGIN{ test=100;if(test>90){ print "very good";} else{print "no pass";}}'
very good
每条语句之间用英文分号 ; 分隔（右花括号 } 前面的分号可以省略）

　　　　b、while循环运用计算从1累加到100的值

[root@wohaoshuai1 bbb]# awk 'BEGIN{test=100;num=0; while(i<=test) {num+=i;i++};{print num}}'
5050

　　　　c、for循环的运用

[root@wohaoshuai1 bbb]# awk 'BEGIN{test=0; for (i=0;i<=100;i++) {test+=i};{print test}}'
5050

　　　　d、循环数组

[root@wohaoshuai1 bbb]# a='a b c'
[root@wohaoshuai1 bbb]# awk -v a="$a" 'BEGIN{split(a,str," ");for(i in str) {print str[i]}}'
a
b
c

　　　　e、do循环的运用

[root@wohaoshuai1 bbb]# awk 'BEGIN{test=0;i=1;do {test+=i;i++} while (i<=100) {print test}}'
5050

　　5、awk的数组运用

　　　　a、数组是awk的灵魂，处理文本中最不能少的就是它的数组处理,因为数组索引（下标）可以是数字和字符串。
　　　　b、在awk中数组叫做关联数组(associative arrays)。
　　　　c、awk 中的数组不必提前声明，也不必声明大小。
　　　　d、数组元素用0或空字符串来初始化，这根据上下文而定。
　　　　e、一般而言，awk中的数组用来从记录中收集信息，可以用于计算总和、统计单词以及跟踪模板被匹配的次数等等。

[root@wohaoshuai1 bbb]# awk -F: 'BEGIN {count=0;} {name[count] = $1;count++;}; END{for (i = 0; i < NR; i++) print i, name[i]}' /etc/passwd 
0 root
1 bin
2 daemon
3 adm
4 lp
5 sync
6 shutdown
7 halt
8 mail
9 operator
10 games
11 ftp
12 nobody
13 avahi-autoipd
14 systemd-bus-proxy
15 systemd-network
16 dbus
17 polkitd
18 apache
19 abrt
20 libstoragemgmt
21 tss

六、awk字符串函数的运用

　　1、sub，匹配记录中最大、最靠左边的子字符串的正则表达式，并用替换字符串替换这些字符串。如果没有指定目标字符串就默认使用整个记录。替换只发生在第一次匹配的时候.

[root@wohaoshuai1 bbb]# awk '{ sub(/abc/, "mytest"); print $0}' 3.txt
mytest"
"xxmytest
sfasmytest
"xxmytestxx"    abc
"a bc"        bcd
mytestsdfdsfeabc    dsfdabc
mytestdefg
ab c
a b c
a c
adc
bbb
a.c
[root@wohaoshuai1 bbb]# awk '{ sub(/abc/, "mytest",$2); print $0}' 3.txt
abc"
"xxabc
sfasabc
"xxabcxx" mytest
"a bc"        bcd
abcsdfdsfeabc dsfdmytest
abcdefg
ab c
a b c
a c
adc
bbb
a.c

　　　　a、上面第一个例子在整个记录中匹配，每条记录中替换只发生在第一次匹配发生的时候。如要替换记录中所有匹配的地方需要用到gsub

　　　　b、上面第二个例子在整个记录的第二个域中进行匹配，替换只发生在第一次匹配发生的时候。

　　2、gsub，对目标字符串（默认为整个记录）中所有匹配的地方进行替换

[root@wohaoshuai1 bbb]# awk '{ gsub(/abc/, "mytest"); print $0}' 3.txt
mytest"
"xxmytest
sfasmytest
"xxmytestxx"    mytest
"a bc"        bcd
mytestsdfdsfemytest    dsfdmytest
mytestdefg
ab c
a b c
a c
adc
bbb
a.c
[root@wohaoshuai1 bbb]# awk '{ gsub(/abc/, "mytest",$2); print $0}' 3.txt
abc"
"xxabc
sfasabc
"xxabcxx" mytest
"a bc"        bcd
abcsdfdsfeabc dsfdmytest
abcdefg
ab c
a b c
a c
adc
bbb
a.c

　　　　a、第一个例子在每条记录中匹配abc，所有匹配的都被替换成mytest。

　　　　b、第二个例子在每条记录的第二个域（$2）中匹配，所有匹配的都被替换成mytest。

　　3、index，返回子字符串第一次被匹配的位置，偏移量从位置1开始 index(string, substring)

　　4、substr，返回从指定起始位置开始、指定长度的子字符串（位置从1开始计数），如果指定长度超过剩余的实际长度，就返回从起始位置到字符串末尾的全部内容 substr( string, start, length )

[root@wohaoshuai1 bbb]# awk '{ print substr( $1, 1,7 ) }' 3.txt
abc"
"xxabc
sfasabc
"xxabcx
"a
abcsdfd
abcdefg
ab
a
a
adc
bbb
a.c
[root@wohaoshuai1 bbb]# awk '{ print substr( $1, 1,11) }' 3.txt
abc"
"xxabc
sfasabc
"xxabcxx"
"a
abcsdfdsfea
abcdefg
ab
a
a
adc
bbb
a.c

　　　　上面例子中分别为打印每行第一个域（$1）从第1个字符开始的7个字符和11个字符

　　5、split，可按给定的分隔符把字符串分割为一个数组。如果分隔符没提供，则按当前FS值进行分割 split( string, array, field separator ) split( string, array )

[root@wohaoshuai1 bbb]# awk -v a="20:18:00" 'BEGIN{ split( a, time, ":" ); for(i in time) {print time[i]} }'
20
18
00

　　6、length，返回记录的字符数 length( string )，length( list/dict )，length

[root@wohaoshuai1 bbb]# awk 'BEGIN{ print length( "test" ) }'
4
[root@wohaoshuai1 bbb]# awk '{print length}' 3.txt
4
6
7
13
11
21
7
4
5
3
3
3
3

　　　　a、第一个实例返回test字符串的长度。

　　　　b、第二个实例返回3.txt文件中每条记录的字符数。

　　7、match，返回在字符串中正则表达式位置的索引，如果找不到指定的正则表达式则返回0。match函数会设置内建变量RSTART为字符串中匹配子字符串的开始位置，RLENGTH为匹配子字符串的长度（字符个数）。substr可利用这些变量来截取字符串 match( string, regular expression )

[root@wohaoshuai1 bbb]# awk 'BEGIN{start=match("this is a test",/[a-z]+$/); print start}'
11
[root@wohaoshuai1 bbb]# awk 'BEGIN{start=match("this is a test",/[a-z]+$/); print start, RSTART, RLENGTH }'
11 11 4

　　　　a、第一个实例打印以连续小写字符结尾的开始位置，这里是11。

　　　　b、第二个实例还打印RSTART和RLENGTH变量，这里是11(start)，11(RSTART)，4(RLENGTH)。

　　8、toupper和tolower，可用于字符串大小写间的转换，这两个函数属于POSIX awk标准，gawk、mawk等常见实现均支持 toupper( string ) tolower( string )

[root@wohaoshuai1 bbb]# awk 'BEGIN{ print toupper("test"), tolower("TEST") }'
TEST test