OpenSource/preprocess at main · ryan8614/OpenSource · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env bash

# ------------------------------------------------------------------------------
# Author: <Ryan Chang>
# Script Name: preprocess
# Description: Preprocess a raw CSV file by cleaning up formatting issues.
#              This includes converting the semicolon separator to the <tab> character,
#              converting Microsoft line endings to Unix line endings, deleting non-ASCII
#              characters, changing the format of floating-point numbers to use '.' as
#              the decimal point, and filling in missing IDs in the first column.
#              The cleaned output is printed to stdout.
# Usage: ./preprocess <file>
# ------------------------------------------------------------------------------

# Validate the invocation: the script takes exactly one argument (the input
# file). Any other count is reported on stderr and ends the run with status 1.
(( $# == 1 )) || {
    echo "Error: The number of arguments does not match" >&2
    exit 1
}

file=$1
sep=';'

# The input must be an existing regular file that the current user can read.
# Existence is tested first, so a missing file gets the "not found" message
# rather than the "not readable" one.
if ! [[ -f $file ]]; then
    echo "Error: File '$file' is not found." >&2
    exit 1
elif ! [[ -r $file ]]; then
    echo "Error: File '$file' is not readable." >&2
    exit 1
fi

# Create a private temp file for the cleaned data and make sure it is removed
# on every exit path (normal completion as well as any early `exit`).
cleaned_file=$(mktemp) || {
    echo "Error: Failed to create a temporary file." >&2
    exit 1
}
trap 'rm -f -- "$cleaned_file"' EXIT

#######################################
# Clean a raw CSV file and write the result to stdout.
#   1. Delete every byte other than tab, newline and printable ASCII. This
#      removes non-ASCII characters and the '\r' of Microsoft line endings.
#   2. Replace each separator with a <tab> character.
#   3. On every line except the first (header) line, use '.' rather than ','
#      as the decimal point in columns 9 and 11.
# Arguments:
#   $1 - path of the raw input file
#   $2 - field separator; it is interpolated into a sed pattern, so it must
#        not contain '/' or unescaped regex metacharacters
# Outputs:
#   The cleaned, tab-separated data on stdout.
#######################################
clean_csv() {
    local tab=$'\t'

    # tr runs first so that sed and gawk only ever see plain ASCII input.
    tr -cd '\11\12\40-\176' < "$1" \
    | sed -e "s/$2/$tab/g" \
    | gawk -F '\t' -v OFS='\t' '
        # Columns 9 and 11 are hard-coded -- NOTE(review): presumably the
        # floating-point columns of the expected input; confirm.
        # The NF guards keep short rows from being padded with empty fields.
        NR > 1 {
            if (NF >= 9)  gsub(/,/, ".", $9)
            if (NF >= 11) gsub(/,/, ".", $11)
        }
        { print }'
}

# Data cleaning
clean_csv "$file" "$sep" > "$cleaned_file"

# Find the max ID number
# Only first-column values on data lines (the header is skipped) that consist
# purely of digits are considered. `maxid + 0` forces a numeric result, so "0"
# is printed when no such ID exists instead of an empty string.
maxid=$(
    gawk -F '\t' '
    NR > 1 && $1 ~ /^[0-9]+$/ {
        id = int($1)
        if (id > maxid) maxid = id
    }
    END {
        print maxid + 0
    }
    ' "$cleaned_file"
)

# Check if the max ID is greater than 0
# An empty value (gawk itself failed) evaluates to 0 here and is rejected too.
if [[ "$maxid" -le 0 ]]; then
    echo "Error: The max ID number is not greater than 0." >&2
    # Do not leave the temporary file behind on this error path.
    rm -f -- "$cleaned_file"
    exit 1
fi

# Fill in missing IDs and print the final result to stdout.
# The ID is assumed to be the first column. The header line is passed through
# unchanged; on every other line a first field that is empty or only
# whitespace is replaced by the next number after the current maximum ID.
gawk -F '\t' -v OFS='\t' -v maxid="$maxid" '
    NR > 1 && $1 ~ /^[ \t]*$/ { $1 = ++maxid }
    { print }
' "$cleaned_file"

# The temporary file is no longer needed once the output has been written.
rm -f "$cleaned_file"

# Report success to the caller.
exit 0