Linux记录-shell获取hdfs used使用

#!/bin/bash
# Collect per-datanode "DFS Used%" from `hdfs dfsadmin -report` and, when
# the cluster is unbalanced, (re)start the HDFS balancer restricted to the
# most- and least-used hosts.

export JAVA_HOME=xxx
export HADOOP_HOME=xxx
export HADOOP_CONF_DIR=xxx

# Work inside the tool directory; abort if it is missing, otherwise the
# `rm -f` cleanup below would run in whatever directory we happen to be in.
cd /home/hdfs/xxx || exit 1

# Archive yesterday's result files under host/, stamping both with a single
# timestamp captured once (two separate `date` calls could straddle midnight
# and disagree). One `mv` per file replaces the original rename-then-move pair.
today=$(date +"%Y-%m-%d")
mv host.txt    "host/host.txt_${today}"
mv hostdfs.txt "host/hostdfs.txt_${today}"

# Remove all intermediate files from the previous run in one call.
rm -f ip.txt allhostname.txt hostname.txt iphostname.txt dfused.txt \
      minhost.txt maxhost.txt host.txt nohup.out dfsreport.txt

echo "start checking hdfs used rate"
"$HADOOP_HOME/bin/hdfs" dfsadmin -report > dfsreport.txt

# From the report extract, one entry per line:
#   ip.txt          - datanode address (the part before ":50010")
#   allhostname.txt - every "Hostname:" value
#   dfused.txt      - every "DFS Used%" value, percent sign stripped
grep -i "50010" dfsreport.txt | awk -F ":" '{print $2}' | awk '{print $1}' > ip.txt
grep -i "hostname" dfsreport.txt | awk -F ": " '{print $2}' > allhostname.txt
grep "DFS Used%" dfsreport.txt | awk -F ": " '{print $2}' | awk -F "%" '{print $1}' > dfused.txt

# Number of live datanodes, parsed out of the "Live datanodes (N):" line.
livesum=$(grep "Live datanodes" dfsreport.txt | awk '{print $3}' | awk -F "):" '{print $1}' | awk -F "(" '{print $2}')
echo "$livesum"

# The first "DFS Used%" line is the cluster-wide aggregate - drop it, then
# keep only the first $livesum entries (the live nodes) in both files.
sed -i '1d' dfused.txt
livesum=$((livesum + 1))
sed -i "${livesum},\$d" dfused.txt
sed -i "${livesum},\$d" ip.txt

linesum=$(wc -l < ip.txt)
echo "$linesum"
harr=($(awk '{print $1}' ip.txt))
darr=($(awk '{print $1}' dfused.txt))
if [ "$linesum" -gt 0 ]; then
  # harr[i] and darr[i] come from the same report position, so pair them
  # directly. The original nested loop with `if [ $i -eq $j ]` did exactly
  # this in O(n^2); a single indexed loop is the O(n) equivalent. The
  # bound on both lengths guards against a short darr producing blank pairs.
  for (( i = 0; i < ${#harr[@]} && i < ${#darr[@]}; i++ )); do
    echo "${harr[$i]}" ":" "${darr[$i]}" >> hostdfs.txt
  done
else
  echo "Not Live Datanodes"
fi
# Distinct used% values sorted numerically descending (the original used a
# plain `sort -ru`, which is lexicographic and ranks e.g. "9.9" above "85.0");
# hharr = bottom 60 (least used), ddarr = top 100 (most used).
hharr=($(awk '{print $3}' hostdfs.txt | sort -nru | tail -n 60))
ddarr=($(awk '{print $3}' hostdfs.txt | sort -nru | head -n 100))

# Map each selected used% value back to the first host carrying that value.
for (( k = 0; k < ${#hharr[@]}; k++ )); do
  if grep -q -- "${hharr[$k]}" hostdfs.txt; then
    grep -- "${hharr[$k]}" hostdfs.txt | awk 'NR==1{print $1}' >> minhost.txt
  fi
done
for (( m = 0; m < ${#ddarr[@]}; m++ )); do
  if grep -q -- "${ddarr[$m]}" hostdfs.txt; then
    grep -- "${ddarr[$m]}" hostdfs.txt | awk 'NR==1{print $1}' >> maxhost.txt
  fi
done

# Balancer include list: most-used hosts first, then least-used ones.
awk '{print $1}' maxhost.txt >> host.txt
awk '{print $1}' minhost.txt >> host.txt

# Address i and hostname i come from the same positions in the dfsadmin
# report, so pair them with one indexed loop instead of the original
# O(n^2) nested loop that only acted when the two indices were equal.
narr=($(awk '{print $1}' allhostname.txt))
for (( k = 0; k < ${#harr[@]} && k < ${#narr[@]}; k++ )); do
  echo "${harr[$k]}" ":" "${narr[$k]}" >> iphostname.txt
done

# Translate each selected address back to its hostname (field 3 of the
# "ip : hostname" lines). `grep -q` replaces the original
# `[[ $(echo $(cat ...) | grep ...) != "" ]]` presence test.
hostarr=($(awk '{print $1}' host.txt))
for (( c = 0; c < ${#hostarr[@]}; c++ )); do
  if grep -q -- "${hostarr[$c]}" iphostname.txt; then
    grep -- "${hostarr[$c]}" iphostname.txt | awk 'NR==1{print $3}' >> hostname.txt
  fi
done

# Highest and average used% (integer-truncated) and their gap. `sort -nr`
# replaces the original lexicographic `sort -r`, which could pick e.g.
# "9.9" as the maximum over "85.0".
max_rate=$(sort -nr dfused.txt | head -n 1 | awk '{print int($0)}')
avg_used_rate=$(awk '{e+=$1}END{print e/NR}' dfused.txt | awk '{print int($0)}')
max_avg_diff=$((max_rate - avg_used_rate))

# Rebalance only when the worst node is more than 5 points above average.
if [ "$max_avg_diff" -gt 5 ]; then
  # Kill any balancer already running. Use the same $JAVA_HOME/bin/jps
  # binary for both the check and the kill lookup (the original checked
  # with a bare `jps`, which may not be on this user's PATH).
  if "$JAVA_HOME/bin/jps" | grep -iq "balancer"; then
    kill -9 "$("$JAVA_HOME/bin/jps" | grep -i "balancer" | awk 'NR==1{print $1}')"
  fi
  # Move the stale balancer id aside, then rebalance only the hosts listed
  # in host.txt, logging in the background.
  "$HADOOP_HOME/bin/hdfs" dfs -mv /system/balancer.id "/system/balancer.id_$(date +"%Y-%m-%d-%H-%M")"
  nohup "$HADOOP_HOME/bin/hdfs" balancer -policy datanode -threshold 5 -include -f host.txt > /home/hdfs/balancer/rebalancer.log 2>&1 &
else
  echo "Nothing to do"
fi

$ nohup hdfs balancer \
    -Ddfs.datanode.balance.max.concurrent.moves=10 \
    -Ddfs.balancer.dispatcherThreads=1024 \
    -Ddfs.datanode.balance.bandwidthPerSec=1073741824


#此配置用于限制允许Datanode平衡群集的最大并发块移动数
dfs.datanode.balance.max.concurrent.moves, default is 5
#带宽
dfs.datanode.balance.bandwidthPerSec, default is 1048576 (=1MB/s)
dfsadmin -setBalancerBandwidth <bandwidth in bytes per second>
#mover线程数
dfs.balancer.moverThreads, default is 1000
#datanode传输的最大线程数
dfs.datanode.max.transfer.threads
修改dfs.datanode.max.transfer.threads=4096 (如果运行HBase的话建议为16384),
指定用于在DataNode间传输block数据的最大线程数,老版本的对应参数为dfs.datanode.max.xcievers。
#平衡策略,默认为datanode
[-policy <policy>]
blockpool: Cluster is balanced if each pool in each node is balanced.
datanode: Cluster is balanced if each datanode is balanced.
#阈值
[-threshold <threshold>] [1.0, 100.0]
#包含列表
[-include [-f <hosts-file> | <comma-separated list of hosts>]]
#排除列表
[-exclude [-f <hosts-file> | <comma-separated list of hosts>]]
#最大移动数据大小
dfs.balancer.max-size-to-move, default is 10737418240 (=10GB)

原文地址:https://www.cnblogs.com/xinfang520/p/10316526.html