-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess
More file actions
executable file
·96 lines (85 loc) · 2.53 KB
/
Copy pathpreprocess
File metadata and controls
executable file
·96 lines (85 loc) · 2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env bash
# ------------------------------------------------------------------------------
# Author: <Ryan Chang>
# Script Name: preprocess
# Description: Preprocess a raw CSV file by cleaning up formatting issues.
# This includes converting the semicolon separator to the <tab> character,
# converting Microsoft line endings to Unix line endings, deleting non-ASCII
# characters, changing the decimal point of floating-point numbers from ','
# to '.', and filling in missing IDs in the first column.
# The cleaned output is printed to stdout.
# Usage: ./preprocess <file>
# ------------------------------------------------------------------------------
# Require exactly one positional argument: the path of the CSV file to clean.
# Any other argument count is reported on stderr and ends the script with
# status 1.
(( $# == 1 )) || {
  printf '%s\n' "Error: The number of arguments does not match" >&2
  exit 1
}

# Input path, and the field separator used by the raw file.
file="$1"
sep=";"

# The input must be an existing regular file ...
[[ -f "$file" ]] || {
  printf '%s\n' "Error: File '$file' is not found." >&2
  exit 1
}

# ... and the current user must be allowed to read it.
[[ -r "$file" ]] || {
  printf '%s\n' "Error: File '$file' is not readable." >&2
  exit 1
}
# Create a temp file to store the cleaned file
cleaned_file=$(mktemp)
# Data cleaning
# 1. Convert the semicolon separator to the <tab> character (replace separator with '\t')
# 2. Convert the Microsoft line endings to Unix line endings (remove '\r')
# 3. Deal with non-ASCII characters by deleting them from the output.
# 4. Change format of floating-point numbers to use '.' rather than ',' as the decimal point.
sed -e "s/$sep/$(printf '\t')/g" < "$file" \
| tr -d '\r' \
| tr -cd '\11\12\15\40-\176\n' \
| gawk -F '\t' -v OFS='\t' '
NR == 1 { print; next }
{
for (i = 1; i <= NF; i++) {
val = $i
if (i == 9 || i == 11) {
$i = ""
for (j = 1; j <= length(val); j++) {
c = substr(val, j, 1)
$i = $i ((c == ",") ? "." : c)
}
}
}
print
}' > "$cleaned_file"
# Determine the largest numeric ID already present in the first column of the
# cleaned file. The header row is skipped, and only first fields made up
# entirely of digits are considered.
maxid=$(
  gawk -F '\t' '
    NR == 1 { next }
    $1 !~ /^[0-9]+$/ { next }
    {
      candidate = int($1)
      if (candidate > maxid) maxid = candidate
    }
    END { print maxid }
  ' "$cleaned_file"
)

# Abort unless a positive maximum was found. An empty result (no numeric ID
# at all) evaluates to 0 in the arithmetic comparison and is rejected too.
if [[ "$maxid" -le 0 ]]; then
  echo "Error: The max ID number is not greater than 0." >&2
  exit 1
fi
# Fill in missing IDs and print the final result to stdout.
# The ID is assumed to be the first column. The header row is printed as-is;
# every later row whose first field is empty or whitespace-only receives the
# next unused number, counting up from the current maximum ID.
gawk -F '\t' -v OFS='\t' -v maxid="$maxid" '
  NR > 1 && $1 ~ /^[ \t]*$/ { $1 = ++maxid }
  { print }
' "$cleaned_file"

# The intermediate file is no longer needed once the output has been written.
rm -f "$cleaned_file"

# Report success to the caller.
exit 0