基于 bash find 命令执行 log 日志备份和清理的一次简单实践

前言

find 作为最基础的 bash shell 命令之一，这里就不多做介绍了。目前每月压缩后的日志增长量已经超过 20TB，和 PB 级相比还不算很大；选择的日志归档同步方案是 Facebook Scribe，之后或许会基于 ELK 或 Grafana Loki 搭建日志实时分析平台。不过眼下的问题还是要想办法在没有商业化集中式存储和软件定义分布式存储支持的情况下，用比较简单粗暴的方法苦苦支撑。本文分享了一些简单的小技巧，方便回顾和二次利用。

基于 bash find 命令执行 log 日志备份和清理的一次简单实践

更新历史

2019 年 07 月 29 日 - 初稿

阅读原文 - https://wsgzao.github.io/post/find/

扩展阅读

find

find - search for files in a directory hierarchy

https://linux.die.net/man/1/find

find 命令用来在指定目录下查找文件。任何位于参数之前的字符串都将被视为欲查找的目录名。如果使用该命令时不设置任何参数，则 find 命令将在当前目录下查找子目录与文件，并将查找到的子目录和文件全部显示出来。

https://man.linuxde.net/find

How to use

log archive

# System crontab fragment that schedules the daily log compression job.
# Jobs run with bash and the restricted PATH below; MAILTO names root as the
# recipient for job output.
SHELL=/bin/bash
PATH=/sbin:/bin:/usr/sbin:/usr/bin
MAILTO=root

# For details see man 4 crontabs

# Example of job definition:
# .---------------- minute (0 - 59)
# |  .------------- hour (0 - 23)
# |  |  .---------- day of month (1 - 31)
# |  |  |  .------- month (1 - 12) OR jan,feb,mar,apr ...
# |  |  |  |  .---- day of week (0 - 6) (Sunday=0 or 7) OR sun,mon,tue,wed,thu,fri,sat
# |  |  |  |  |
# *  *  *  *  * user-name  command to be executed

# zip: compress old logs every day at 03:15 as root.
# Disabled variant: run.sh is started from its own directory (it calls
# ./archive_from_date.sh by relative path) with "2" as its archive_day argument.
#15 3 * * * root cd /opt/sa_scripts/archive_from_date && bash run.sh   /data/gop/live/primary/gop 2
# Active job: one_run_2d.sh is invoked by absolute path, so no cd is needed.
15 3 * * * root bash /opt/sa_scripts/archive_from_date/one_run_2d.sh

log backup and cleanup

# Manual monthly backup workflow. Run it from the scripts directory: the
# commands below use relative paths (./*.sh and log/).
cd /data/scripts
# 1. Preview: pass the month to check in the original log tree.
./check_original_log.sh 2019-05
# 2. Pass the same month to the move step.
./move_backup.sh 2019-05
# 3. Verify: pass the same month to check in the backup tree.
./check_backup_log.sh 2019-05
# 4. Start the rsync jobs in the background. nohup keeps them running after
#    logout; stdout and stderr are captured in a file under log/.
# NOTE(review): the first log file is named 2019-04 while every other step
# uses 2019-05 - confirm this is intended.
nohup sh rsync_backup_10-71-12-61.sh > log/2019-04.rsync.out 2>&1 &
nohup sh rsync_backup_10-71-14-132.sh > log/2019-05.rsync.out 2>&1 &

log archive

archive_from_date.sh

#!/bin/bash
#
# archive_from_date.sh - compress rotated log files day by day.
#
# Usage: archive_from_date.sh path archive_days [trailing_digit]
#   path            directory that contains the log files
#   archive_days    files dated up to (not including) "today - archive_days"
#                   are compressed
#   trailing_digit  accepted for backward compatibility; not used
#
# Files are expected to be named like:
#   auth_data-2017-01-11_00000
#   auth_data-2017-01-11_00001
#   auth_data-2017-01-12_00000
# For every day between the fixed start date and the end date, the files of
# that day are packed into <prefix>-<date>.tgz and the originals are removed
# only after the archive was written successfully.
#
# Archives that could not be created are recorded in
# /tmp/log_archive/result as "<directory>: <archive prefix>".

if [ "$#" -lt 2 ]
then
	echo "Usage: $0 path archive_days [trailing_digit]"
	exit 1
fi

FILE_PATH=$1
ARCHIVE_DAYS=$2
# The trailing digit parameter is not needed by the logic below; it is only
# kept so that existing callers passing a third argument keep working.
TRAILING_DIGIT=${3:-6}

start_date="2017-01-01"

if ! end_date=$(date -d "-${ARCHIVE_DAYS}days" +%Y-%m-%d)
then
	echo "Invalid archive_days: ${ARCHIVE_DAYS}" >&2
	exit 1
fi
echo "Scanning from $start_date to $end_date (not inclusive)"
echo "=================================="

# ISO dates compare correctly as strings.
if [[ "$end_date" < "$start_date" ]]
then
	echo "Invalid end date: $end_date,it should be later than start date: ${start_date}. Exit..."
	exit 1
fi

RESULT_PATH=/tmp/log_archive
# Failures are appended to a file inside RESULT_PATH; appending to the
# directory itself cannot work.
RESULT_FILE="${RESULT_PATH}/result"
mkdir -p "$RESULT_PATH"

# Never continue in the wrong directory: the loop below deletes files.
cd "$FILE_PATH" || {
	echo "Cannot cd to ${FILE_PATH}" >&2
	exit 1
}

# Let unmatched globs expand to nothing instead of the literal pattern.
shopt -s nullglob

while [ "$start_date" != "$end_date" ]
do
	# Rotated files of the current day; their names end in a digit.
	day_files=( *"${start_date}"*[0-9] )
	if (( ${#day_files[@]} > 0 ))
	then
		echo "Compressing: $start_date"
		first_file=${day_files[0]}
		# Strip the "_NNNNN" rotation suffix to get the archive prefix.
		file_name=${first_file%_*}
		members=( "${file_name}"*[0-9] )

		# An optional stricter check (keep the originals when the archive is
		# smaller than 1/20 of the summed original size) can be added here.

		if (( ${#members[@]} == 0 ))
		then
			echo "No files match ${file_name}*[0-9]; nothing archived" >&2
			echo "$(pwd): $file_name" >>"$RESULT_FILE"
		elif ! tar cvzf "${file_name}.tgz" -- "${members[@]}"
		then
			# A failed tar may still leave a non-empty partial archive, so
			# the exit status is checked before the size.
			echo "tar failed for ${file_name}; not deleting origin" >&2
			echo "$(pwd): $file_name" >>"$RESULT_FILE"
		elif [ ! -s "${file_name}.tgz" ]
		then
			echo "tar file size is ZERO!"
			echo "$(pwd): $file_name" >>"$RESULT_FILE"
		else
			echo "Done.Deleting origin."
			rm -- "${members[@]}"
		fi
	fi
	# Abort instead of looping forever if the date arithmetic fails.
	start_date=$(date -I -d "$start_date +1day") || exit 1
done

archive_from_day.sh

#!/bin/bash
# bash one_day.sh /data/gop/live/primary/gop 2017-01-11
#
# Compress the rotated log files of one single day inside one directory.
#
# Arguments:
#   $1  directory that contains the log files
#   $2  date string that appears in the file names (e.g. 2017-01-11)
#
# The files of that day are packed into <prefix>-<date>.tgz and the originals
# are removed only after the archive was written successfully. Archives that
# could not be created are recorded in /tmp/log_archive/result.

# Both arguments are mandatory: with an empty path "cd" would switch to the
# home directory, and with an empty date the glob below would match every
# file whose name ends in a digit.
if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]
then
	echo "Usage: $0 path date" >&2
	exit 1
fi

FILE_PATH=$1
ARCHIVE_DAYS=$2

start_date=$ARCHIVE_DAYS

end_date=$ARCHIVE_DAYS
echo "Scanning from $start_date to $end_date (not inclusive)"
echo "=================================="


RESULT_PATH=/tmp/log_archive
# Failures are appended to a file inside RESULT_PATH; appending to the
# directory itself cannot work.
RESULT_FILE="${RESULT_PATH}/result"
mkdir -p "$RESULT_PATH"

# Never continue in the wrong directory: files are deleted below.
cd "$FILE_PATH" || {
	echo "Cannot cd to ${FILE_PATH}" >&2
	exit 1
}

# Let unmatched globs expand to nothing instead of the literal pattern.
shopt -s nullglob

day_files=( *"${start_date}"*[0-9] )
if (( ${#day_files[@]} > 0 ))
then
	echo "Compressing: $start_date"
	first_file=${day_files[0]}
	# Strip the "_NNNNN" rotation suffix to get the archive prefix.
	file_name=${first_file%_*}
	members=( "${file_name}"*[0-9] )

	if (( ${#members[@]} == 0 ))
	then
		echo "No files match ${file_name}*[0-9]; nothing archived" >&2
		echo "$(pwd): $file_name" >>"$RESULT_FILE"
	elif ! tar cvzf "${file_name}.tgz" -- "${members[@]}"
	then
		# A failed tar may still leave a non-empty partial archive, so the
		# exit status is checked before the size.
		echo "tar failed for ${file_name}; not deleting origin" >&2
		echo "$(pwd): $file_name" >>"$RESULT_FILE"
	elif [ ! -s "${file_name}.tgz" ]
	then
		echo "tar file size is ZERO!"
		echo "$(pwd): $file_name" >>"$RESULT_FILE"
	else
		echo "Done.Deleting origin."
		rm -- "${members[@]}"
	fi
fi

run.sh

#!/bin/bash
#
# run.sh - run archive_from_date.sh on every matching directory of a tree.
#
# Usage: run.sh parent_path archive_day
#   parent_path  root of the log tree to scan
#   archive_day  passed through unchanged as the second argument of
#                archive_from_date.sh
#
# Must be started from the directory that contains archive_from_date.sh,
# because that script is called by relative path.

PARENT_PATH=$1
ARCHIVE_DAY=$2

if [ "$#" -ne 2 ]
then
	echo "Usage: $0 parent_path archive_day"
	exit 1
fi

# Collect the directories first (NUL-delimited, so names with spaces or glob
# characters survive), then process them. "-links 2" selects directories
# with a hard-link count of 2, which on traditional Unix filesystems
# presumably means directories without subdirectories.
leaf_dirs=()
while IFS= read -r -d '' folder
do
	leaf_dirs+=("$folder")
done < <(find "$PARENT_PATH" -type d -links 2 -print0)

for folder in "${leaf_dirs[@]}"
do
	echo "processing:${folder}..."
	./archive_from_date.sh "$folder" "$ARCHIVE_DAY"
done

one_run_2d.sh

#!/bin/bash
# Cron entry point: start one_run.sh in the background for the day that lies
# two days in the past. All output of that run goes to a log file named after
# the date.

script_dir=/opt/sa_scripts/archive_from_date
end_date=$(date -d -2days +%Y-%m-%d)
log_file=${script_dir}/logs/${end_date}.log

nohup bash ${script_dir}/one_run.sh /data/gop/live/primary/gop $end_date > $log_file 2>&1 &

one_run.sh

#!/bin/bash
#
# one_run.sh - run the per-day archive script on every matching directory.
#
# Usage: one_run.sh parent_path archive_day
#   parent_path  root of the log tree to scan
#   archive_day  date string passed through unchanged as the second argument
#                of the per-day script

PARENT_PATH=$1
ARCHIVE_DAY=$2

# An empty date is rejected as well: it must never be forwarded to a script
# that selects files by date.
if [ "$#" -ne 2 ] || [ -z "$ARCHIVE_DAY" ]
then
	echo "Usage: $0 parent_path archive_day"
	exit 1
fi

# Collect the directories first (NUL-delimited, so names with spaces or glob
# characters survive), then process them. "-links 2" selects directories
# with a hard-link count of 2, which on traditional Unix filesystems
# presumably means directories without subdirectories.
leaf_dirs=()
while IFS= read -r -d '' folder
do
	leaf_dirs+=("$folder")
done < <(find "$PARENT_PATH" -type d -links 2 -print0)

for folder in "${leaf_dirs[@]}"
do
	echo "processing:${folder}..."
	/opt/sa_scripts/archive_from_date/day.sh "$folder" "$ARCHIVE_DAY"
done

backup and rsync scripts

check_original_log.sh

#!/bin/bash
# List every path below the original log tree that contains the given
# pattern (case-insensitive), e.g. a month such as 2019-05.
#
# Usage: check_original_log.sh pattern

if [ -z "$1" ]
then
	echo "Usage: $0 pattern" >&2
	exit 1
fi

var_date=$1
var_src="/data/gop/live/primary/gop"
# "--" keeps a pattern that starts with "-" from being read as an option.
find "${var_src}" | grep -i -- "${var_date}"

check_backup_log.sh


#!/bin/bash
# List every path below the backup tree that contains the given pattern
# (case-insensitive), e.g. a month such as 2019-05.
#
# Usage: check_backup_log.sh pattern

if [ -z "$1" ]
then
	echo "Usage: $0 pattern" >&2
	exit 1
fi

var_date=$1
var_src="/data/backup/"
# "--" keeps a pattern that starts with "-" from being read as an option.
find "${var_src}" | grep -i -- "${var_date}"

move_backup.sh

#!/bin/bash
# Move every path below the original log tree that contains the given
# pattern (e.g. a month such as 2019-05) to the same relative location below
# the backup tree.
#
# Usage: move_backup.sh pattern
#
# Three files are written below ./log (relative to the current directory):
#   <pattern>.ori.out  matching source paths
#   <pattern>.mod.out  the corresponding destination paths
#   <pattern>.out      "source|destination" pairs that drive the move

if [ -z "$1" ]
then
	echo "Usage: $0 pattern" >&2
	exit 1
fi

var_date=$1
var_path="/data/gop/live/primary/gop"
var_str="/data/backup"

mkdir -p log

# analyze log

find "${var_path}" | grep -i -- "${var_date}" > "log/${var_date}.ori.out"
cp "log/${var_date}.ori.out" "log/${var_date}.mod.out"
# Rewrite only the leading source prefix into the backup prefix.
sed -i "s:^${var_path}:${var_str}:" "log/${var_date}.mod.out"

# merge move action

paste -d "|" "log/${var_date}.ori.out" "log/${var_date}.mod.out" > "log/${var_date}.out"

# move files

# Read the pairs line by line on a separate descriptor, so paths containing
# spaces stay intact and the commands in the loop cannot consume the list.
# Paths that contain "|" are not supported by this file format.
while IFS='|' read -r var_a var_b <&3
do
	# Skip entries that are already gone, e.g. because a matching parent
	# directory was moved by an earlier iteration.
	if [ ! -e "${var_a}" ] && [ ! -L "${var_a}" ]
	then
		continue
	fi
	# The destination directory does not necessarily exist yet.
	mkdir -p -- "$(dirname -- "${var_b}")"
	mv -- "${var_a}" "${var_b}"
done 3< "log/${var_date}.out"

rsync_backup.sh


#!/bin/bash
# Copy the local backup tree to the "backup" module of the remote rsync
# daemon. -a is archive mode, -P keeps partial files and shows progress.
backup_root="/data/backup/"
remote_module="rsync://10.71.12.61:873/backup/"

rsync -aP "${backup_root}" "${remote_module}"

clean_backup_log.sh


#!/bin/bash
# Delete every path below the backup tree that contains the given pattern
# (case-insensitive), e.g. a month such as 2019-05.
#
# Usage: clean_backup_log.sh pattern

# Refuse to run without a pattern: this script deletes files.
if [ -z "$1" ]
then
	echo "Usage: $0 pattern" >&2
	exit 1
fi

var_date=$1
var_src="/data/backup/"
# The whole pipeline is NUL-delimited so that names containing spaces,
# quotes or newlines reach rm unchanged. xargs -r skips rm when nothing
# matched.
find "${var_src}" -print0 | grep -iz -- "${var_date}" | xargs -0 -r rm -f --

log cleanup exclude list

create python script to combine multiple lines

def combine_lines(lines, marker="line"):
    """Join lines with "|" and start a new output row at every marker line.

    lines  -- iterable of text lines (trailing whitespace is stripped)
    marker -- a line starting with this prefix begins a new output row; it is
              emitted with a leading newline

    Returns the combined text as a single string.
    """
    pieces = []
    for line in lines:
        text = line.rstrip()
        if line.startswith(marker):
            # A marker line opens a new row in the output.
            text = "\n" + text
        pieces.append(text)
    return '|'.join(pieces)


if __name__ == "__main__":
    # The context manager closes the input file once it has been read.
    with open("file") as handle:
        print(combine_lines(handle.readlines()))

# simple example
English
中文

English|中文

# the prefix tested by line.startswith can be changed
line1
text1
text2
text3
line2
something1
something2

line1|text1|text2|text3|
line2|something1|something2

use find to filter the log

# Build two directory lists below /data/nc_backup:
#   log_source  every directory
#   log_target  every directory except those of the services listed below
find /data/nc_backup -type d > log_source

# Directories whose path contains one of these names are excluded.
exclude_services=(
  gop-live-tcp_server
  gop-live-sso_website
  gop-live-app_point
  gop-live-gop_vk_data_server
  gop-live-gop_huawei_data_server
  gop-live-gop_line_data_server
  gop-live-gop_google_data_server
  gop-live-gop_fb_data_server
  gop-live-gop_data_server
  gop-live-general-api_server
  gop-live-payment_center
  gop-live-msdk_api
  gop-live-cron_autofix
  gop-live-shell_api_server
  gop-live-api_server
  gop-staging-payment_center
  gop-staging-msdk_api
)
# Join the names with "|" into one extended regular expression.
exclude_pattern=$(IFS='|'; printf '%s' "${exclude_services[*]}")
# grep -E replaces the obsolescent egrep.
find /data/nc_backup -type d | grep -Ev -- "${exclude_pattern}" > log_target

# remove the first line
sed -i '1d' log_source
sed -i '1d' log_target