LecygneNoir
/
thea2Shattering_i18n_tools


								#!/usr/bin/env bash


								# This script converts a multi-language RTF export from memoQ into a gettext .po file


								# Validate the command line: exactly one argument, the memoQ RTF export to convert.
								# Diagnostics go to stderr so they never end up in redirected output.
								if [[ "$#" -ne 1 ]]; then
								    echo "Please pass exactly 1 parameter: the rtf to convert" >&2
								    exit 1
								fi

								ORIGINAL_RTF="$1"
								# Every clean-up step below edits this copy in place; the original export is never modified.
								WORKING_RTF="$1.work"
								PO_DIR="$(pwd)/"

								# Stop early instead of silently converting an empty working file.
								if [[ ! -r "${ORIGINAL_RTF}" ]]; then
								    echo "Cannot read the rtf to convert: ${ORIGINAL_RTF}" >&2
								    exit 1
								fi

								if ! TEMP_DIR="$(mktemp -d)"; then
								    echo "Unable to create a temporary directory" >&2
								    exit 1
								fi
								# The temporary directory only holds intermediate output: remove it whenever the script exits.
								trap 'rm -rf -- "${TEMP_DIR}"' EXIT

								# Output name: basename of the input with the ".txt_fre.rtf" suffix replaced by ".po"
								PO=$(basename -- "${ORIGINAL_RTF%%.txt_fre.rtf}.po")
								TEMP_PO="${TEMP_DIR}/${PO}"
								# PO_DIR already ends with a slash
								PO="${PO_DIR}${PO}"

								if ! cat -- "${ORIGINAL_RTF}" > "${WORKING_RTF}"; then
								    echo "Unable to create the working copy: ${WORKING_RTF}" >&2
								    exit 1
								fi

								# Best effort line-ending normalisation: errors (including a missing dos2unix) are ignored on purpose
								dos2unix "${WORKING_RTF}" &> /dev/null

								# All the expressions below only look at the current line, so they are applied in a
								# single sed pass, in the same order as if each one was run on its own.
								rtf_cleanup=(
								    # Remove all unused lines
								    -e '/brdrcf17/d'
								    -e '/ltrpar/d'
								    -e '/fs16/d'
								    -e '/\\row/d'
								    # Strip RTF markup surrounding the text
								    -e 's/\\\\//g'
								    -e 's/{\\rtlch\\fcs1\ \\ltrch\\fcs0//g'
								    -e 's/\\noproof \\cell }//g'
								    -e 's/\\cell }//g'
								    -e 's/{\\plain\\noproof\ \\cs99\\f0\\fs20\\cf13\ \\{MQ\\}}//g'
								    # Remove useless translations of EVENT, NODE, STORY
								    -e '/\lang1036 -- \[EVENT\]/d'
								    -e '/\lang1036 +\[NODE\]/d'
								    -e '/\lang1036 \[STORY\]/d'
								    -e '/\lang1036 \[\/EVENT\]/d'
								    -e '/\lang1036 \[\/NODE\]/d'
								    -e '/\lang1036 \[\/STORY\]/d'
								)
								sed -i "${rtf_cleanup[@]}" -- "${WORKING_RTF}"

								# Remove headers
								# This must stay a separate pass: the 23 lines are counted on the already filtered file.
								sed -i '1,23d' -- "${WORKING_RTF}"


								#######################################
								# Decode the RTF unicode escapes (\uc0\uNNN) of one line, as iconv and other
								# tools cannot detect them. Also turns RTF unbreakable spaces (\~) into
								# regular spaces.
								# RTF terminates each escape with one delimiter space which is not part of
								# the text: that space is removed together with the escape.
								# Arguments: $1 - line to decode (falls back to the global "line" when omitted)
								# Outputs:   the decoded line on stdout, whitespace and glob characters untouched
								# Notes:     the bytes produced for non-ASCII characters depend on the current
								#            locale (bash \U expansion), so run with a UTF-8 locale.
								#######################################
								function convert_char_to_utf8 {
								    local text="${1-${line}}"
								    # \uc0\u, optional sign, decimal code, optional delimiter space
								    local escape_re='\\uc0\\u(-?)([0-9]+) ?'
								    local escape code hex char

								    # Convert all unbreakable spaces in the line
								    text=${text//'\~'/' '}

								    # Each iteration replaces the leftmost escape by a single character, so the
								    # string always shrinks and the loop ends even on unexpected input.
								    while [[ "${text}" =~ ${escape_re} ]]; do
								        escape="${BASH_REMATCH[0]}"
								        # 10# forces base 10 so that a leading zero is not read as octal
								        code=$(( 10#${BASH_REMATCH[2]} ))
								        if [[ -n "${BASH_REMATCH[1]}" ]]; then
								            # RTF stores \uN as a signed 16 bit value: negative means N + 65536
								            code=$(( (65536 - code) & 0xFFFF ))
								        fi
								        # eg: 171 -> 000000ab -> \U000000ab -> «
								        printf -v hex '%08x' "${code}"
								        # The format only contains \U and hexadecimal digits, so it is safe to expand
								        printf -v char "\\U${hex}"
								        # Pattern and replacement are quoted to be taken literally (backslashes, &, /)
								        text=${text/"${escape}"/"${char}"}
								    done

								    printf '%s\n' "${text}"
								}


								# Parser state shared by all iterations of the read loop below:
								#   comment_event / comment_node : last "-- [EVENT]" / "+[NODE]" line seen, reused in the "#." comments
								#   comment_story                : non-empty while the loop is inside a [STORY] ... [/STORY] section
								#   comment_out                  : "#." comment built for the current [OUT] line
								#   msgstr                       : translated (lang1036) story lines, accumulated until [/STORY]
								#   msgid                        : initialised here, not referenced again in the loop below
								#   out_incr                     : number appended to [OUT] comments, reset to 1 at [/NODE] and [/EVENT]
								#   story_first_line             : 1 until the first text line of the current story was written as msgid
								comment_event=""

								comment_node=""

								comment_story=""

								comment_out=""

								msgstr=""

								msgid=""

								out_incr=1

								story_first_line=0

								# Read the cleaned RTF line by line and append the matching po content to ${TEMP_PO}.
								# The branches are tested in order and the first matching one wins.
								# Note: read is used without IFS=, so leading and trailing whitespace of each line is trimmed.
								while read -r line; do

								    if  [[ ${line} == *"-- [EVENT]"* ]]; then

								        # Start of an event: remember the line for the next comments
								        comment_event="${line}"

								    elif [[ ${line} == *"+[NODE]"* ]]; then

								        # Start of a node: remember the line for the next comments
								        comment_node="${line}"

								    elif [[ ${line} == *"[STORY]"* ]]; then

								        # The comment is "event@@node@@story line"
								        comment_story="${comment_event}@@${comment_node}@@${line}"

								        # Strip the remaining \noproof and \cell markers from the comment

								        comment_story="${comment_story//'\noproof '/}"

								        comment_story="${comment_story//'\cell }'/}"

								        # A new story starts: write its "#." comment to the temporary po on a single line

								        echo "#. ${comment_story}" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"

								        echo >> "${TEMP_PO}"

								        # The next text line will be the first line of the msgid
								        story_first_line=1

								    elif [[ ${line} == *"[/STORY]"* ]]; then

								        # End of story reached: insert the accumulated translated msgstr and reinit

								        msgstr="${msgstr//'\lang1036 '/}"

								        msgstr="${msgstr//'\cell }'/}"

								        # If there is still lang1036 then it means the msgstr is untranslated, insert an empty translation

								        if [[ ${msgstr} == *"lang1036"* ]]; then

								            echo "msgstr \"\"" >> "${TEMP_PO}"

								            echo >> "${TEMP_PO}"

								        else

								            # msgstr holds the quoted continuation lines, each one starting on its own line
								            echo "msgstr \"\"${msgstr}" >> "${TEMP_PO}"

								            echo >> "${TEMP_PO}"

								        fi

								        comment_story=""

								        msgstr=""

								    elif [[ ${line} == *"\noproof [OUT]"* ]]; then

								        # It's an OUT for the English part (with noproof), so write the comment_out

								        comment_out="${comment_event}@@${comment_node}@@[OUT]${out_incr}"

								        comment_out="${comment_out//'\noproof '/}"

								        comment_out="${comment_out//'\cell }'/}"

								        # OUT entries are one line, write it to the po right away

								        echo "#. ${comment_out}" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"

								        echo >> "${TEMP_PO}"

								        line="${line//'\cell }'/}"

								        line="${line//'\noproof '/}"

								        # Escape double quotes so they stay valid inside the po string

								        line="${line//'"'/'\"'}"

								        # The msgid is the line without its leading [OUT] marker
								        echo "msgid \"${line#\[OUT\]}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"

								        echo >> "${TEMP_PO}"

								        # Insert an empty msgstr in case of untranslated [OUT]

								        # (in this case it never matches the \lang1036 [OUT] below so we need to insert it outside)

								        echo "msgstr \"\"" >> "${TEMP_PO}"

								        echo >> "${TEMP_PO}"

								        ((out_incr++))

								    elif [[ ${line} == *"\lang1036 [OUT]"* ]]; then

								        # It's an OUT for the translated part (with lang1036), so write the msgstr content.

								        line="${line//'\cell }'/}"

								        line="${line//'\lang1036 '/}"

								        # Escape double quotes so they stay valid inside the po string

								        line="${line//'"'/'\"'}"

								        encoded_line=$(convert_char_to_utf8 "${line}")

								        # Insert without the msgstr keyword, as it has been inserted together with the msgid before.

								        echo "\"${encoded_line#\[OUT\]}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"

								        echo >> "${TEMP_PO}"

								    elif [[ ${line} == *"[/NODE]"* ]]; then

								        # End of node reached, reinit the node level state

								        comment_node=""

								        comment_story=""

								        comment_out=""

								        out_incr=1

								    elif [[ ${line} == *"[/EVENT]"* ]]; then

								        # End of event reached, reinit the event and node level state

								        comment_event=""

								        comment_node=""

								        comment_story=""

								        comment_out=""

								        out_incr=1

								    elif [[ "${comment_story}" != "" && ${story_first_line} == 1 ]]; then

								        # Stories sometimes contain empty lines, skip them

								        if [[ "${line}" = "" ]]; then

								            continue

								        fi

								        # If the line has nothing particular, and the comment_story is not empty, it's a story string

								        # As story_first_line is set, it's the first line of the story: insert the msgid
								        # (the \noproof marker of the line is replaced by the msgid keyword and opening quote)

								        line="${line//'\cell }'/}"

								        # Escape double quotes so they stay valid inside the po string

								        line="${line//'"'/'\"'}"

								        echo "${line//'\noproof '/'msgid "'}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"

								        echo >> "${TEMP_PO}"

								        story_first_line=0

								    elif [[ ${comment_story} != "" ]]; then

								        # Stories sometimes contain empty lines, skip them

								        if [[ "${line}" = "" ]]; then

								            continue

								        fi

								        # If it contains lang1036 it's a translated line, keep it for later

								        if [[ $(echo "${line}" | grep "lang1036") ]]; then

								            # Escape double quotes so they stay valid inside the po string

								            line="${line//'"'/'\"'}"

								            encoded_line=$(convert_char_to_utf8 "${line}")

								            # Each translated line is appended as its own quoted string ending with a literal \n,
								            # the assignment below is a multi-line string so that each one starts on a new line

								            msgstr="${msgstr}

								\"${encoded_line}\n\""

								            continue

								        fi

								        # Further source line of a multi-line msgid: add \n before the closing quote of the
								        # last line already written to the po (no way to detect it sooner...)

								        sed -e '$s/\(.*\)"$/\1\\n"/' -i "${TEMP_PO}"

								        line="${line//'\cell }'/}"

								        line="${line//'\noproof '/}"

								        # Escape double quotes so they stay valid inside the po string

								        line="${line//'"'/'\"'}"

								        echo "\"${line}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"

								        echo >> "${TEMP_PO}"

								    else

								        # It should be a newline, write a newline

								        # If new cases are added later they need to be dealt with manually: do not break the file by writing them

								        echo "" >> "${TEMP_PO}"

								    fi


								done < "${WORKING_RTF}"


								# Put the default po header in front of the generated entries.
								# The result is assembled in a side file which only replaces the temporary po
								# when the concatenation succeeded.
								{
								    printf '%s\n' 'msgid ""

								msgstr ""

								"MIME-Version: 1.0\n"

								"Content-Transfer-Encoding: 8bit\n"

								"Content-Type: text/plain; charset=UTF-8\n"

								"Project-Id-Version: \n"

								"POT-Creation-Date: \n"

								"PO-Revision-Date: \n"

								"Last-Translator: \n"

								"Language-Team: \n"'
								    cat "${TEMP_PO}"
								} > "${TEMP_PO}.tmp" && mv "${TEMP_PO}.tmp" "${TEMP_PO}"

								# Merge the duplicated entries produced above and write the final po, without re-wrapping long lines
								msguniq --no-wrap "${TEMP_PO}" > "${PO}"