ambevar-dotfiles/.scripts/tc-text-2utf8

52 lines
1.1 KiB
Bash
Executable File

#!/bin/sh
## Print the help text on stdout.
usage() {
	cat <<EOF
Usage: ${0##*/} FOLDERS
Convert all 'bad' encoding to UTF-8/LF.
WARNING: It will fail for encodings other the one explicitly supported below.
EOF
}
## Without any folder there is nothing to do: show the help on stderr and
## report a usage error instead of falling through to the rest of the script.
if [ $# -eq 0 ]; then
	usage >&2
	exit 1
fi
## An explicit help request is not an error. Stop here so that "-h" is not
## treated as a folder name further down.
if [ "$1" = "-h" ]; then
	usage
	exit 0
fi
## The charset conversions are delegated to recode(1). If it is missing,
## abort with a failure status and send the diagnostic to stderr so that
## callers and pipelines can tell the run did not succeed.
if ! command -v recode >/dev/null 2>&1; then
	echo "recode needed." >&2
	exit 1
fi
## Convert one file to UTF-8 with LF line endings, in place, according to
## what file(1) reports about it. Prints a "<kind>: [<path>]" line for every
## conversion that is applied; files with an unrecognized description are
## left untouched.
##   $1: path of the file to convert.
convert_file() {
	## -b drops the file name from the output, so a path that happens to
	## contain "CRLF" or "ISO-8859" cannot be mistaken for a detected encoding.
	CODING=$(file -b "$1")
	case "$CODING" in
	*ISO-8859*)
		echo "ISO-8859: [$1]"
		recode latin1..utf-8 "$1" ;;
	*'Non-ISO extended-ASCII'*)
		echo "cp1252: [$1]"
		recode cp1252..utf-8 "$1" ;;
	## The second alternative covers the "Unicode text, UTF-16, ..." wording
	## that newer file(1) releases appear to use.
	*'UTF-16 Unicode text'* | *'Unicode text, UTF-16'*)
		echo "UTF-16: [$1]"
		recode utf-16..utf-8 "$1" ;;
	## Likewise "Unicode text, UTF-8 (with BOM) text" for newer releases.
	*'UTF-8 Unicode (with BOM)'* | *'UTF-8 (with BOM)'*)
		echo "UTF-8 BOM: [$1]"
		## Drop the first character of the first line, i.e. the BOM.
		## stdin is redirected so that ex can never read the list of
		## remaining files that the calling loop receives on its stdin.
		ex -sc '1s/^.//|xit' "$1" </dev/null ;;
	## The following commands are funny alternatives, but are completely
	## overkill.
	# dd iflag=skip_bytes skip=3 if=file.srt of=temp.srt
	# dd bs=1 skip=3 if=file.srt of=temp.srt
	# tail -c +4 file.srt > temp.srt
	esac
	## Line endings are handled independently of the charset.
	case "$CODING" in
	*CRLF*)
		echo "CRLF: [$1]"
		## The carriage return is generated with printf rather than
		## embedded literally in the script, where it is invisible and
		## easily lost. Command substitution only strips trailing
		## newlines, so the CR survives.
		ex -sc "%s/$(printf '\r')//g|xit" "$1" </dev/null ;;
	esac
}

## Walk every folder given on the command line and convert each regular file
## smaller than 50M found below it.
for i; do
	while IFS= read -r j; do
		## When find prints nothing, the here-document still contains one
		## empty line; skip it instead of running file(1) on "".
		[ -n "$j" ] || continue
		convert_file "$j"
	done <<EOF
$(find "$i" -type f -size -50M -print)
EOF
done