#!/usr/bin/env bash

# Converts a multi-language RTF export from memoQ into a gettext .po file.
#
# Usage:    <this script> <export>.txt_fre.rtf
# Output:   <export>.po in the current directory.
# Requires: dos2unix, GNU sed (for `sed -i`), msguniq (gettext).
#
# Side effect kept from the original behaviour: a cleaned-up working copy
# named "<input>.work" is left next to the input file.

if [[ "$#" -ne 1 ]]; then
  echo "Please pass exactly 1 parameter: the rtf to convert" >&2
  exit 1
fi

ORIGINAL_RTF="$1"
if [[ ! -r "${ORIGINAL_RTF}" ]]; then
  echo "Cannot read input file: ${ORIGINAL_RTF}" >&2
  exit 1
fi

WORKING_RTF="$1.work"
PO_DIR="$(pwd)/"
TEMP_DIR="$(mktemp -d)" || {
  echo "Cannot create a temporary directory" >&2
  exit 1
}
# The temporary directory only holds intermediate files: remove it on any exit.
trap 'rm -rf -- "${TEMP_DIR:?}"' EXIT
PO=$(basename -- "${ORIGINAL_RTF%%.txt_fre.rtf}.po")
TEMP_PO="${TEMP_DIR}/${PO}"
# PO_DIR already ends with a slash.
PO="${PO_DIR}${PO}"

if ! cat -- "${ORIGINAL_RTF}" > "${WORKING_RTF}"; then
  echo "Cannot write working copy: ${WORKING_RTF}" >&2
  exit 1
fi

# Best effort, as before: a failure is reported but does not stop the conversion.
if ! dos2unix "${WORKING_RTF}" &> /dev/null; then
  echo "Warning: dos2unix failed or is not installed; line endings left as-is" >&2
fi

# Clean the working copy in a single pass. Every command below works on one
# line at a time, so running them in this order inside one sed invocation gives
# the same result as running them as successive full-file rewrites.
sed -i \
  -e '/brdrcf17/d' \
  -e '/ltrpar/d' \
  -e '/fs16/d' \
  -e '/\\row/d' \
  -e 's/\\\\//g' \
  -e 's/{\\rtlch\\fcs1\ \\ltrch\\fcs0//g' \
  -e 's/\\noproof \\cell }//g' \
  -e 's/\\cell }//g' \
  -e 's/{\\plain\\noproof\ \\cs99\\f0\\fs20\\cf13\ \\{MQ\\}}//g' \
  -e '/\\lang1036 -- \[EVENT\]/d' \
  -e '/\\lang1036 +\[NODE\]/d' \
  -e '/\\lang1036 \[STORY\]/d' \
  -e '/\\lang1036 \[\/EVENT\]/d' \
  -e '/\\lang1036 \[\/NODE\]/d' \
  -e '/\\lang1036 \[\/STORY\]/d' \
  "${WORKING_RTF}"
# The six \lang1036 deletions drop the useless translations of the EVENT, NODE
# and STORY markers. The backslash is escaped so the pattern matches the
# literal control word instead of relying on how sed treats an unknown "\l".

# Remove headers. This must stay a separate pass: the line numbers refer to the
# file as it looks after the deletions above.
sed -i '1,23d' "${WORKING_RTF}"
# Decode the RTF Unicode escapes (\uc0\uNNN) of one line into UTF-8, because
# iconv and similar tools cannot detect them.
#
# Arguments: $1 - the line to decode; falls back to the global `line` when no
#                 argument is given.
# Outputs:   the decoded line on stdout, followed by a newline.
#
# Details:
#   - "\~" (RTF non-breaking space) becomes a regular space.
#   - RTF writes one space after each escape as a delimiter; that single space
#     is removed together with the escape.
#   - NNN is a decimal UTF-16 code unit. RTF writes values above 32767 as
#     negative numbers, and characters outside the BMP as two consecutive
#     surrogate escapes; both are handled.
#     NOTE(review): whether memoQ repeats "\uc0" before the second surrogate is
#     not known here, so both spellings are accepted.
#   - Bytes are produced directly as UTF-8 (the charset declared in the PO
#     header), independently of the current locale.
#   - Code 0 is dropped; anything that is not a valid code point is left
#     untouched in the output.
function convert_char_to_utf8 {
  local rest="${1-${line-}}"
  local out="" match sign digits hex utf8
  local cp low
  local -r nbsp_escape='\~'
  local -r plain_space=' '
  local -r escape_re='\\uc0\\u(-?)([0-9]+) ?'
  local -r low_half_re='^(\\uc0)?\\u(-?)([0-9]+) ?'

  # Convert all unbreakable spaces in the line.
  rest=${rest//"${nbsp_escape}"/"${plain_space}"}

  # Consume the line left to right: text before each escape is copied to `out`,
  # so every iteration shortens `rest` and the loop always terminates.
  while [[ ${rest} =~ ${escape_re} ]]; do
    match=${BASH_REMATCH[0]}
    sign=${BASH_REMATCH[1]}
    digits=${BASH_REMATCH[2]}
    out+=${rest%%"${match}"*}
    rest=${rest#*"${match}"}

    # Absurdly long numbers cannot be code points: keep the text verbatim.
    if (( ${#digits} > 7 )); then
      out+=${match}
      continue
    fi

    # 10# forces base 10 so that leading zeros are not read as octal.
    cp=$(( 10#${digits} ))
    if [[ -n ${sign} ]]; then
      cp=$(( (65536 - cp) & 0xFFFF ))
    fi

    # A high surrogate directly followed by a low surrogate is one character.
    if (( cp >= 0xD800 && cp <= 0xDBFF )) \
      && [[ ${rest} =~ ${low_half_re} ]] \
      && (( ${#BASH_REMATCH[3]} <= 7 )); then
      low=$(( 10#${BASH_REMATCH[3]} ))
      if [[ -n ${BASH_REMATCH[2]} ]]; then
        low=$(( (65536 - low) & 0xFFFF ))
      fi
      if (( low >= 0xDC00 && low <= 0xDFFF )); then
        cp=$(( 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00) ))
        rest=${rest#"${BASH_REMATCH[0]}"}
      fi
    fi

    if (( cp == 0 )); then
      # A NUL cannot be stored in a shell string; drop it.
      continue
    elif (( cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF) )); then
      # Out of range or unpaired surrogate: not encodable, keep verbatim.
      out+=${match}
      continue
    fi

    # Build the UTF-8 byte sequence as \xHH escapes, then materialise it.
    if (( cp < 0x80 )); then
      printf -v hex '\\x%02x' "${cp}"
    elif (( cp < 0x800 )); then
      printf -v hex '\\x%02x\\x%02x' \
        "$(( 0xC0 | (cp >> 6) ))" \
        "$(( 0x80 | (cp & 0x3F) ))"
    elif (( cp < 0x10000 )); then
      printf -v hex '\\x%02x\\x%02x\\x%02x' \
        "$(( 0xE0 | (cp >> 12) ))" \
        "$(( 0x80 | ((cp >> 6) & 0x3F) ))" \
        "$(( 0x80 | (cp & 0x3F) ))"
    else
      printf -v hex '\\x%02x\\x%02x\\x%02x\\x%02x' \
        "$(( 0xF0 | (cp >> 18) ))" \
        "$(( 0x80 | ((cp >> 12) & 0x3F) ))" \
        "$(( 0x80 | ((cp >> 6) & 0x3F) ))" \
        "$(( 0x80 | (cp & 0x3F) ))"
    fi
    printf -v utf8 '%b' "${hex}"
    out+=${utf8}
  done

  # Quoted on purpose: the text may contain "[OUT]", "?" or "*", which must not
  # be glob-expanded, and inner spacing must be preserved.
  printf '%s\n' "${out}${rest}"
}
# Append one line to the temporary PO file. Carriage returns and newlines
# embedded in the text are dropped, then exactly one newline is written.
# Arguments: $1 - text of the line
function _po_append_line {
  local text="${1//$'\r'/}"
  text="${text//$'\n'/}"
  printf '%s\n' "${text}" >> "${TEMP_PO}"
}

# RTF fragments that are stripped or rewritten while building PO lines. They
# are kept in variables so the substitutions below never depend on how quotes
# nested inside ${var//pattern/replacement} are parsed.
rtf_noproof='\noproof '
rtf_cell_end='\cell }'
rtf_lang_fr='\lang1036 '
dquote='"'
escaped_dquote='\"'
msgid_open='msgid "'
po_trailing_break='\n"'

# Parser state:
#   comment_event / comment_node - current [EVENT] / [NODE] marker lines
#   comment_story                - non-empty while inside a [STORY]
#   comment_out                  - extracted comment of the current [OUT]
#   msgstr                       - translated story lines collected so far
#   out_incr                     - index of the next [OUT] inside the node
#   story_first_line             - 1 until the first story line was written
comment_event=""
comment_node=""
comment_story=""
comment_out=""
msgstr=""
out_incr=1
story_first_line=0

# `read` without IFS= is intentional: surrounding whitespace is trimmed, which
# the empty-line checks below rely on. The `|| [[ -n ... ]]` part makes sure a
# final line without a trailing newline is still processed.
while read -r line || [[ -n "${line}" ]]; do
  if [[ ${line} == *"-- [EVENT]"* ]]; then
    comment_event="${line}"

  elif [[ ${line} == *"+[NODE]"* ]]; then
    comment_node="${line}"

  elif [[ ${line} == *"[STORY]"* ]]; then
    # Time for a new story: write its extracted comment to the PO.
    comment_story="${comment_event}@@${comment_node}@@${line}"
    comment_story=${comment_story//"${rtf_noproof}"/}
    comment_story=${comment_story//"${rtf_cell_end}"/}
    _po_append_line "#. ${comment_story}"
    story_first_line=1

  elif [[ ${line} == *"[/STORY]"* ]]; then
    # End of story: write the collected translation and reset.
    msgstr=${msgstr//"${rtf_lang_fr}"/}
    msgstr=${msgstr//"${rtf_cell_end}"/}
    if [[ ${msgstr} == *"lang1036"* ]]; then
      # A leftover lang1036 means the story is untranslated: empty msgstr.
      printf 'msgstr ""\n\n' >> "${TEMP_PO}"
    else
      # Every collected line ends with \n, but the msgid never ends with one
      # (it only gets \n between lines). gettext requires msgid and msgstr to
      # agree on a trailing newline, so drop the last one.
      if [[ ${msgstr} == *"${po_trailing_break}" ]]; then
        msgstr=${msgstr%"${po_trailing_break}"}${dquote}
      fi
      printf 'msgstr ""%s\n\n' "${msgstr}" >> "${TEMP_PO}"
    fi
    comment_story=""
    msgstr=""

  elif [[ ${line} == *"\noproof [OUT]"* ]]; then
    # English side of an [OUT] (marked noproof): comment, msgid, empty msgstr.
    comment_out="${comment_event}@@${comment_node}@@[OUT]${out_incr}"
    comment_out=${comment_out//"${rtf_noproof}"/}
    comment_out=${comment_out//"${rtf_cell_end}"/}
    _po_append_line "#. ${comment_out}"

    line=${line//"${rtf_cell_end}"/}
    line=${line//"${rtf_noproof}"/}
    # Escape double quotes for the PO string.
    line=${line//"${dquote}"/"${escaped_dquote}"}
    _po_append_line "msgid \"${line#\[OUT\]}\""

    # The msgstr keyword is written here because an untranslated [OUT] never
    # reaches the lang1036 branch below; a translation, if any, is appended as
    # a continuation string.
    printf 'msgstr ""\n\n' >> "${TEMP_PO}"
    ((out_incr++))

  elif [[ ${line} == *"\lang1036 [OUT]"* ]]; then
    # Translated side of an [OUT]: continuation of the msgstr written above.
    line=${line//"${rtf_cell_end}"/}
    line=${line//"${rtf_lang_fr}"/}
    line=${line//"${dquote}"/"${escaped_dquote}"}
    encoded_line=$(convert_char_to_utf8 "${line}")
    _po_append_line "\"${encoded_line#\[OUT\]}\""

  elif [[ ${line} == *"[/NODE]"* ]]; then
    # End of node: reset node-level state.
    comment_node=""
    comment_story=""
    comment_out=""
    out_incr=1

  elif [[ ${line} == *"[/EVENT]"* ]]; then
    # End of event: reset everything.
    comment_event=""
    comment_node=""
    comment_story=""
    comment_out=""
    out_incr=1

  elif [[ "${comment_story}" != "" && ${story_first_line} == 1 ]]; then
    # Developers sometimes leave empty lines inside a STORY: skip them.
    if [[ "${line}" = "" ]]; then
      continue
    fi
    # First line of the story: it opens the msgid.
    line=${line//"${rtf_cell_end}"/}
    line=${line//"${dquote}"/"${escaped_dquote}"}
    line=${line//"${rtf_noproof}"/"${msgid_open}"}
    _po_append_line "${line}\""
    story_first_line=0

  elif [[ ${comment_story} != "" ]]; then
    if [[ "${line}" = "" ]]; then
      continue
    fi
    # A line containing lang1036 is a translated line: keep it for [/STORY].
    if [[ ${line} == *lang1036* ]]; then
      line=${line//"${dquote}"/"${escaped_dquote}"}
      encoded_line=$(convert_char_to_utf8 "${line}")
      # One PO string per translated line, each on its own line.
      msgstr+=$'\n'"\"${encoded_line}\\n\""
      continue
    fi
    # Another source line follows, so the previous msgid string needs a \n
    # before its closing quote (this cannot be known any earlier).
    sed -e '$s/\(.*\)"$/\1\\n"/' -i "${TEMP_PO}"
    line=${line//"${rtf_cell_end}"/}
    line=${line//"${rtf_noproof}"/}
    line=${line//"${dquote}"/"${escaped_dquote}"}
    _po_append_line "\"${line}\""

  else
    # Anything else should be an empty line. Unknown content is deliberately
    # not copied so that it cannot break the PO file.
    echo "" >> "${TEMP_PO}"
  fi
done < "${WORKING_RTF}"
# Insert the default PO header in front of the generated entries.
# The quoted here-document keeps the \n sequences literal.
if ! {
  cat << 'PO_HEADER'
msgid ""
msgstr ""
"MIME-Version: 1.0\n"
"Content-Transfer-Encoding: 8bit\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Project-Id-Version: \n"
"POT-Creation-Date: \n"
"PO-Revision-Date: \n"
"Last-Translator: \n"
"Language-Team: \n"
PO_HEADER
  cat -- "${TEMP_PO}"
} > "${TEMP_PO}.tmp"; then
  echo "Cannot assemble the PO file from ${TEMP_PO}" >&2
  exit 1
fi
mv -- "${TEMP_PO}.tmp" "${TEMP_PO}" || exit 1

# Merge the duplicated entries that the PO syntax does not allow.
# The result goes to a scratch file first so that a failing msguniq never
# replaces an existing PO with an empty or truncated one.
if ! msguniq --no-wrap "${TEMP_PO}" > "${TEMP_PO}.uniq"; then
  echo "msguniq failed; ${PO} was not written" >&2
  exit 1
fi
mv -- "${TEMP_PO}.uniq" "${PO}"