ambevar-dotfiles/.scripts/tc-text-2utf8

#!/bin/sh

## Print the help text on stderr, one printf argument per output line.
## The program name is taken from $0 with its directory part removed.
usage () {
	printf >&2 '%s\n' \
		"Usage: ${0##*/} FOLDERS" \
		"" \
		"Convert all 'bad' encoding to UTF-8/LF." \
		"" \
		"WARNING: It will fail with encodings other the one explicitly supported in the script code." \
		""
}

## Command line handling: at least one argument is required, -h shows the
## help text and a leading -- is dropped.
if [ $# -eq 0 ]; then
	usage && exit 1
fi

case "$1" in
-h)
	usage && exit;;
--)
	shift;;
esac

## All the conversions but the BOM/CRLF ones are delegated to recode.
command -v recode >/dev/null 2>&1 || {
	echo >&2 "recode needed."
	exit 1
}

## Walk every FOLDER argument and convert, in place, each regular file smaller
## than 50M whose description by file(1) matches one of the cases below.
## Files reported with CRLF line terminators then get their carriage returns
## removed.

## Literal carriage return for the ex(1) substitution below. It is built with
## printf so that the script does not have to embed a raw control character.
cr=$(printf '\r')

for i ; do
	while IFS= read -r j; do
		## The here-document yields one empty line when find prints nothing.
		[ -n "$j" ] || continue

		## -b leaves the file name out of the description, so that a path
		## containing e.g. "CRLF" or "ISO-8859" cannot trigger a conversion.
		CODING=$(file -b "$j")

		## The second pattern of the UTF-16 and UTF-8 BOM cases is the wording
		## of more recent file(1) releases ("Unicode text, UTF-16, ...").
		case "$CODING" in
		*ISO-8859*)
			echo "ISO-8859:   [$j]"
			recode latin1..utf-8 "$j";;
		*'Non-ISO extended-ASCII'*)
			echo "cp1252:     [$j]"
			recode cp1252..utf-8 "$j";;
		*'UTF-16 Unicode text'*|*'Unicode text, UTF-16'*)
			echo "UTF-16:     [$j]"
			recode utf-16..utf-8 "$j";;
		*'UTF-8 Unicode (with BOM)'*|*'UTF-8 (with BOM)'*)
			echo "UTF-8 BOM:  [$j]"
			## Drop the first character of the first line, i.e. the BOM.
			## stdin is redirected because 'ex -s' reads commands from it,
			## and the loop's stdin is the list of files still to process.
			ex -sc '1s/^.//|xit' "$j" </dev/null;;
			## The following commands are funny alternatives, but are completely
			## overkill.
			# dd iflag=skip_bytes skip=3 if=file.srt of=temp.srt
			# dd bs=1 skip=3 if=file.srt of=temp.srt
			# tail -c +4 file.srt > temp.srt
		esac

		## $CODING was computed before any conversion; the line terminators
		## are handled last, on the possibly recoded file.
		case "$CODING" in
		*CRLF*)
			echo "CRLF:       [$j]"
			ex -sc "%s/$cr//g|xit" "$j" </dev/null;;
		esac

	done <<EOF
$(find "$i" -type f -size -50M -print)
EOF
done