Example 1
#!/bin/bash
# file: check_order.sh
# Per-key order check for a CSV: column 1 is the key, column 2 the value.
# Reports out-of-order rows, gaps in step-1 integer sequences, and a final
# per-key ordered/unordered status. Useful on very large (10G+) files since
# it streams the file once through awk.
#
# Usage: bash check_order.sh file.csv [asc|desc]   (direction defaults to asc)
set -euo pipefail

# Run the ordering check.
# Arguments: $1 - CSV path; $2 - sort direction "asc" (default) or "desc".
# Outputs:   diagnostics and per-key status on stdout.
# Returns:   2 on an invalid direction argument, awk's status otherwise.
check_order() {
  local file="$1"
  local order="${2:-asc}"   # sort direction: asc (ascending) or desc (descending)

  if [[ "$order" != "asc" && "$order" != "desc" ]]; then
    printf 'check_order: order must be "asc" or "desc", got: %s\n' "$order" >&2
    return 2
  fi

  awk -F',' -v order="$order" '
  BEGIN {
    # Hash tables: last value seen per key, and per-key order status.
    delete prev_value;
    delete key_status;  # 1 = ordered, 0 = out of order
  }
  {
    if (NR == 1) {       # skip the CSV header row
      next;
    }
    key = $1;
    current_val = $2;
    if (NR % 1000000 == 0) {   # progress heartbeat for huge files
      print "now processing " key ":" current_val;
    }
    # First sighting of this key: record the value, assume ordered.
    if (!(key in prev_value)) {
      prev_value[key] = current_val;
      key_status[key] = 1;
      next;
    }
    prev_val = prev_value[key];
    # Numeric comparison when both values look numeric.
    if (current_val ~ /^-?[0-9]+(\.[0-9]+)?$/ && prev_val ~ /^-?[0-9]+(\.[0-9]+)?$/) {
      num_curr = current_val + 0;
      num_prev = prev_val + 0;
      if ((order == "asc" && num_curr < num_prev) || (order == "desc" && num_curr > num_prev)) {
        print "found bad seq: key=" key ",cur=" num_curr ",prev=" num_prev
        key_status[key] = 0;
      }
      # Gap check assumes step-1 integer sequences per key.
      if ((order == "asc" && num_curr != num_prev + 1) || (order == "desc" && num_curr + 1 != num_prev)) {
        print "found gap seq: key=" key ",cur=" num_curr ",prev=" num_prev
      }
    } else {
      print "should not hit, cur=" current_val ", prev=" prev_val
      # Fallback: lexicographic comparison for non-numeric values.
      if ((order == "asc" && current_val < prev_val) || (order == "desc" && current_val > prev_val)) {
        key_status[key] = 0;
      }
    }
    # Remember this value for the next row of the same key.
    prev_value[key] = current_val;
  }
  END {
    # Emit the final per-key status.
    for (key in key_status) {
      status = (key_status[key] == 1) ? "有序" : "无序";
      print key ":" status;
    }
  }
  ' "$file"
}

if (( $# == 0 )); then
  # No file given: print usage to stderr but do not abort, so the function
  # stays available when this file is sourced (e.g. for testing).
  printf 'Usage: %s file.csv [asc|desc]\n' "${0##*/}" >&2
else
  check_order "$@"
fi
使用方式:
bash check_order.sh xxx.csv
在处理大文件(10G+)时特别有用。
Example 2
csv中,第一列是股票代码,第二列是分钟线时间。下面这段脚本检测,对于每一个股票代码,他的分钟线时间都是逐分钟递增,检测是否存在跳变的情况。
# Minute difference (hhmm_1 - hhmm_2) between two "HH:MM" strings.
# Two market-calendar special cases are treated as exactly one minute:
#   - the lunch break: 11:30 (last morning bar) -> 13:01 (first afternoon bar)
#   - a repeated 15:00 closing bar
# The extra parameters arr/a/b are awk-style locals; the original version
# left them as globals, which could silently clobber caller state.
function diff_min(hhmm_1, hhmm_2,    arr, a, b) {
    if (hhmm_1 == "13:01" && hhmm_2 == "11:30") return 1;
    if (hhmm_1 == "15:00" && hhmm_2 == "15:00") return 1;
    split(hhmm_1, arr, ":")
    a = arr[1]*60 + arr[2]
    split(hhmm_2, arr, ":")
    b = arr[1]*60 + arr[2]
    return a - b
}
# Per-code minute-bar continuity check: column 1 = stock code,
# column 2 = minute timestamp. Flags any jump that is not a proper
# one-minute step (as defined by diff_min, which also handles the
# lunch break and the duplicated closing bar).
BEGIN {
# Reset state arrays: last minute seen per code, per-code status.
delete prev_min;
delete key_status;
}
{
# Skip the CSV header row.
if (NR == 1) next;
key = $1
# Keep only the "HH:MM" prefix of the timestamp field.
min = substr($2, 1, 5)
# First bar for this code: record it and assume OK so far.
if (!(key in prev_min)) {
prev_min[key] = min
key_status[key] = 1 # ok
next
}
pmin = prev_min[key]
# Anything other than a one-minute step is a jump; store the minute
# BEFORE the jump so the final report shows where it happened.
if (diff_min(min, pmin) != 1) {
key_status[key] = pmin
}
prev_min[key] = min
}
END {
# Report "ok", or the minute preceding the last detected jump.
for (key in key_status) {
status = (key_status[key] == 1) ? "ok" : key_status[key]
print key ": " status
}
}
Example 3
csv中,第一列为股票代码,第二列为分钟线时间。对于相同股票代码,可能有重复的分钟线。需要过滤掉重复的但保留最后一条。
# De-duplicate rows by (stock code, minute) = (column 1, column 2),
# keeping only the LAST occurrence of each key while preserving the
# original line order. Row 1 (the header) is always kept.
# NOTE(review): buffers the whole file in memory; assumes fields contain
# no embedded commas — confirm for quoted CSV input.
{
    lines[NR] = $0
    if (NR == 1) next;
    key = $1 "_" $2
    # A previously-seen row with the same key is now a known duplicate.
    if (key in seen) {
        dup_lines[seen[key]] = 1
    }
    seen[key] = NR
}
END {
    # Replay rows in order, dropping superseded duplicates.
    for (row = 1; row <= NR; row++) {
        record = lines[row]
        split(record, cols, ",")
        key = cols[1] "_" cols[2]
        if (!(row in dup_lines) || row == seen[key]) {
            print record
        }
    }
}
Example 4
csv中,最后一列为数据序列号。check序列号逐一递增。
# Verify that the last CSV column (a serial number) increases by exactly 1
# on every row. Prints "passed" or "failed"; stops with exit status 2 at
# the first break so huge files are not scanned further.
BEGIN {
    flag = 1            # 1 = continuous so far, 0 = break found
}
{
    if (NR == 1) next;  # skip the CSV header row
    cur_val = $(NF) + 0
    # Only compare once a numeric previous value exists (row 2 has none).
    if (prev_val ~ /^-?[0-9]+(\.[0-9]+)?$/ && cur_val != prev_val + 1) {
        print "Line " NR " not continuous, prev:" prev_val ", cur:" cur_val
        flag = 0
        exit 2          # END still runs and prints the verdict
    }
    prev_val = cur_val
}
END {
    print (flag ? "passed" : "failed")
}
Inline examples
# Print rows where column 2 equals 689009 and column 1 matches 10:57:24
# (two equivalent forms: dynamic-regex string vs. /regex/ literal).
awk -F, '$2==689009 && $1~"10:57:24" {print $0}' mdl_4_4_0.csv
awk -F, '$2==689009 && $1 ~ /10:57:24/ {print $0}' mdl_4_4_0.csv
# Print rows where column 1 is "920088" and the second-to-last column
# matches 10:31:0x; bare `print` is equivalent to `print $0`.
awk -F',' '$1=="920088" && $(NF-1) ~ /10:31:0/ {print}' mdl_14_8_0.csv
# Negated match (lines NOT containing the pattern): use the "!~" operator.
awk -F',' '$1=="920088" && $(NF-1) !~ /10:31:0/ {print}' mdl_14_8_0.csv