#!/usr/bin/env bash
# This script converts a multi-language RTF export from memoQ to a gettext .po file.
#
# Usage: pass exactly one argument, the RTF file to convert.
# The .po file is written to the current directory, named after the RTF file
# with its ".txt_fre.rtf" suffix replaced by ".po".
# A cleaned-up working copy of the RTF is left next to the input as "<input>.work".

# Print a message on stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [[ "$#" -ne 1 ]]; then
  die "Please pass exactly 1 parameter: the rtf to convert"
fi

ORIGINAL_RTF="$1"
WORKING_RTF="$1.work"

if [[ ! -r "${ORIGINAL_RTF}" ]]; then
  die "Cannot read the rtf to convert: ${ORIGINAL_RTF}"
fi

# No trailing slash here: it is added when the final path is built
PO_DIR="$(pwd)"
TEMP_DIR="$(mktemp -d)" || die "Cannot create a temporary directory"
# The temporary directory only holds intermediate files, drop it on every exit path
trap 'rm -rf -- "${TEMP_DIR}"' EXIT

PO=$(basename -- "${ORIGINAL_RTF%%.txt_fre.rtf}.po")
TEMP_PO="${TEMP_DIR}/${PO}"
PO="${PO_DIR}/${PO}"

cat -- "${ORIGINAL_RTF}" > "${WORKING_RTF}" \
  || die "Cannot create the working copy: ${WORKING_RTF}"

# Line-ending normalisation is best effort: report a failure but keep going
if ! dos2unix "${WORKING_RTF}" &> /dev/null; then
  printf 'Warning: dos2unix failed on %s, line endings were left untouched\n' \
    "${WORKING_RTF}" >&2
fi

# Every command below only looks at the current line, so they can share a single
# pass over the file and still behave like consecutive "sed -i" runs.
sed -i \
  -e '/brdrcf17/d' \
  -e '/ltrpar/d' \
  -e '/fs16/d' \
  -e '/\\row/d' \
  -e 's/\\\\//g' \
  -e 's/{\\rtlch\\fcs1\ \\ltrch\\fcs0//g' \
  -e 's/\\noproof \\cell }//g' \
  -e 's/\\cell }//g' \
  -e 's/{\\plain\\noproof\ \\cs99\\f0\\fs20\\cf13\ \\{MQ\\}}//g' \
  -e '/\\lang1036 -- \[EVENT\]/d' \
  -e '/\\lang1036 +\[NODE\]/d' \
  -e '/\\lang1036 \[STORY\]/d' \
  -e '/\\lang1036 \[\/EVENT\]/d' \
  -e '/\\lang1036 \[\/NODE\]/d' \
  -e '/\\lang1036 \[\/STORY\]/d' \
  "${WORKING_RTF}" || die "Cannot clean up ${WORKING_RTF}"
# The first group removes all unused lines and RTF markup.
# The "\\lang1036" group removes the useless translations of EVENT, NODE and STORY;
# the backslash is escaped so that the literal control word \lang1036 is matched.

# Remove headers. This must stay a separate pass: the line numbers are those of
# the file once the unused lines above are gone.
sed -i '1,23d' "${WORKING_RTF}" || die "Cannot remove the headers of ${WORKING_RTF}"

# Quick function to deal with \uc0\uXXX chars in RTF format, as iconv and other tools cannot detect them...
# In addition to that, RTF adds a useless space after each encoded char that needs to be removed.

#######################################
# Decode the RTF unicode escapes (\uc0\uN) and unbreakable spaces (\~) of a line.
# The characters are encoded as UTF-8 by hand, so the result does not depend on
# the locale the script runs in.
# Arguments:
#   $1 - line to decode. When omitted, the global `line` is used.
# Outputs:
#   The decoded line on stdout, followed by a newline.
#######################################
convert_char_to_utf8() {
  local remaining="${1-${line-}}"
  local decoded="" escape="" bytes="" char=""
  local -i cp=0
  local -r nbsp='\~'
  # \uc0\u, an optional minus sign, the decimal code, then the optional padding space
  local -r escape_re='\\uc0\\u(-?)([0-9]+) ?'

  # Convert all unbreakable spaces in the line
  remaining="${remaining//"${nbsp}"/ }"

  # Each iteration consumes the leftmost escape, so the loop always terminates
  while [[ "${remaining}" =~ ${escape_re} ]]; do
    escape="${BASH_REMATCH[0]}"
    # Force base 10: a leading zero must not be read as octal
    cp=$(( 10#${BASH_REMATCH[2]} ))
    # RTF stores \uN as a signed 16-bit number: code points above 32767 are negative
    if [[ -n "${BASH_REMATCH[1]}" ]] && (( cp > 0 )); then
      cp=$(( 65536 - cp ))
    fi

    char=""
    if (( cp < 0 || cp > 0x10FFFF )); then
      # Not a code point: keep the escape as it was
      char="${escape}"
    elif (( cp > 0 )); then
      # cp == 0 is skipped on purpose: a NUL cannot be stored and is dropped
      if (( cp < 0x80 )); then
        printf -v bytes '\\x%02x' "${cp}"
      elif (( cp < 0x800 )); then
        printf -v bytes '\\x%02x\\x%02x' \
          "$(( 0xC0 | (cp >> 6) ))" \
          "$(( 0x80 | (cp & 0x3F) ))"
      elif (( cp < 0x10000 )); then
        printf -v bytes '\\x%02x\\x%02x\\x%02x' \
          "$(( 0xE0 | (cp >> 12) ))" \
          "$(( 0x80 | ((cp >> 6) & 0x3F) ))" \
          "$(( 0x80 | (cp & 0x3F) ))"
      else
        printf -v bytes '\\x%02x\\x%02x\\x%02x\\x%02x' \
          "$(( 0xF0 | (cp >> 18) ))" \
          "$(( 0x80 | ((cp >> 12) & 0x3F) ))" \
          "$(( 0x80 | ((cp >> 6) & 0x3F) ))" \
          "$(( 0x80 | (cp & 0x3F) ))"
      fi
      # Turn the \xHH sequence into the actual bytes
      printf -v char '%b' "${bytes}"
    fi

    # Plain string splicing: the decoded char may be "&" or "/", which would be
    # special in a sed replacement
    decoded+="${remaining%%"${escape}"*}${char}"
    remaining="${remaining#*"${escape}"}"
  done

  printf '%s\n' "${decoded}${remaining}"
}

#######################################
# Print the default po header.
# Outputs:
#   The header entry on stdout.
#######################################
print_po_header() {
  cat << 'EOF'
msgid ""
msgstr ""
"MIME-Version: 1.0\n"
"Content-Transfer-Encoding: 8bit\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Project-Id-Version: \n"
"POT-Creation-Date: \n"
"PO-Revision-Date: \n"
"Last-Translator: \n"
"Language-Team: \n"
EOF
}

#######################################
# Convert the cleaned-up RTF rows into po entries.
# The entries are collected in memory so that the previous line can still be
# amended when a story turns out to be multiline.
# Inputs:
#   The cleaned-up RTF rows on stdin.
# Outputs:
#   The po entries on stdout.
#######################################
rtf_rows_to_po() {
  local -r tok_cell='\cell }' tok_noproof='\noproof ' tok_lang='\lang1036 '
  local -r tok_out='[OUT]' dquote='"' escaped_dquote='\"' cr=$'\r'
  local comment_event="" comment_node="" comment_story="" comment_out=""
  local msgstr="" line="" encoded_line=""
  local -i out_incr=1 story_first_line=0 last=0
  local -a po_lines=()

  # No "IFS=" on purpose: leading and trailing blanks of a row are not wanted.
  # The "||" part keeps a last row that has no trailing newline.
  while read -r line || [[ -n "${line}" ]]; do
    # Drop carriage returns left behind if the line endings were not converted
    line="${line//"${cr}"/}"

    if [[ "${line}" == *'-- [EVENT]'* ]]; then
      comment_event="${line}"

    elif [[ "${line}" == *'+[NODE]'* ]]; then
      comment_node="${line}"

    elif [[ "${line}" == *'[STORY]'* ]]; then
      comment_story="${comment_event}@@${comment_node}@@${line}"
      # Clean strings
      comment_story="${comment_story//"${tok_noproof}"/}"
      comment_story="${comment_story//"${tok_cell}"/}"
      # Time for a new story, write the comment to the po
      po_lines+=("#. ${comment_story}")
      story_first_line=1

    elif [[ "${line}" == *'[/STORY]'* ]]; then
      # We reach the end of the story, insert the translated msgstr and reinit
      msgstr="${msgstr//"${tok_lang}"/}"
      msgstr="${msgstr//"${tok_cell}"/}"
      # If there is still lang1036 then the msgstr is untranslated, insert an empty translation
      if [[ "${msgstr}" == *lang1036* ]]; then
        po_lines+=('msgstr ""' '')
      else
        po_lines+=("msgstr \"\"${msgstr}" '')
      fi
      comment_story=""
      msgstr=""

    elif [[ "${line}" == *'\noproof [OUT]'* ]]; then
      # It's an out for the english part (with noproof) so write the comment_out
      comment_out="${comment_event}@@${comment_node}@@[OUT]${out_incr}"
      comment_out="${comment_out//"${tok_noproof}"/}"
      comment_out="${comment_out//"${tok_cell}"/}"
      # OUT are one line, let's write it to the po
      po_lines+=("#. ${comment_out}")
      line="${line//"${tok_cell}"/}"
      line="${line//"${tok_noproof}"/}"
      # Escape the double quotes. The assignment is left unquoted so that the
      # quoted replacement is taken literally by every bash version.
      line=${line//"${dquote}"/"${escaped_dquote}"}
      # The empty msgstr covers an untranslated [OUT]: in that case the
      # \lang1036 [OUT] row below never shows up to provide one
      po_lines+=("msgid \"${line#"${tok_out}"}\"" 'msgstr ""' '')
      out_incr=$(( out_incr + 1 ))

    elif [[ "${line}" == *'\lang1036 [OUT]'* ]]; then
      # It's an out for the translated part (with lang1036) so write the msgstr
      line="${line//"${tok_cell}"/}"
      line="${line//"${tok_lang}"/}"
      line=${line//"${dquote}"/"${escaped_dquote}"}
      encoded_line="$(convert_char_to_utf8 "${line}")"
      # Insert without the msgstr keyword as it has been inserted with the msgid before
      po_lines+=("\"${encoded_line#"${tok_out}"}\"")

    elif [[ "${line}" == *'[/NODE]'* ]]; then
      # We reach the end of the node, reinit
      comment_node=""
      comment_story=""
      comment_out=""
      out_incr=1

    elif [[ "${line}" == *'[/EVENT]'* ]]; then
      # We reach the end of the event, reinit
      comment_event=""
      comment_node=""
      comment_story=""
      comment_out=""
      out_incr=1

    elif [[ -n "${comment_story}" ]] && (( story_first_line == 1 )); then
      # As developers sometimes forget empty lines at the end of a STORY, skip empty lines
      if [[ -z "${line}" ]]; then
        continue
      fi
      # First line of the story: it opens the msgid
      line="${line//"${tok_cell}"/}"
      line=${line//"${dquote}"/"${escaped_dquote}"}
      # Same shape as the [OUT] msgid above, so the keyword is always present
      po_lines+=("msgid \"${line//"${tok_noproof}"/}\"")
      story_first_line=0

    elif [[ -n "${comment_story}" ]]; then
      # As developers sometimes forget empty lines at the end of a STORY, skip empty lines
      if [[ -z "${line}" ]]; then
        continue
      fi
      # If it contains lang1036 it's a translated line, keep it for later
      if [[ "${line}" == *lang1036* ]]; then
        line=${line//"${dquote}"/"${escaped_dquote}"}
        encoded_line="$(convert_char_to_utf8 "${line}")"
        # One string per line, each of them ending with \n
        # NOTE(review): the last translated line gets a trailing \n too, while
        # the last msgid line gets none - confirm this asymmetry is intended.
        msgstr+=$'\n'"\"${encoded_line}\\n\""
        continue
      fi
      # This is a multiline msgid: the previous line needs a \n before its
      # closing quote (no way to detect it sooner...)
      last=$(( ${#po_lines[@]} - 1 ))
      if (( last >= 0 )) && [[ "${po_lines[last]}" == *"${dquote}" ]]; then
        po_lines[last]="${po_lines[last]%"${dquote}"}\\n\""
      fi
      line="${line//"${tok_cell}"/}"
      line="${line//"${tok_noproof}"/}"
      line=${line//"${dquote}"/"${escaped_dquote}"}
      po_lines+=("\"${line}\"")

    else
      # It should be a newline, write a newline.
      # If new cases are added later they need to be dealt with manually,
      # do not break the file by writing them.
      po_lines+=('')
    fi
  done

  # printf with no argument would still print one empty line
  if (( ${#po_lines[@]} > 0 )); then
    printf '%s\n' "${po_lines[@]}"
  fi
}

#######################################
# Build the final po out of the working RTF copy.
# Globals:
#   WORKING_RTF (read), TEMP_PO (file written), PO (file written)
# Returns:
#   0 on success, non-zero when an input or a tool is missing or msguniq fails.
#######################################
convert_working_rtf() {
  if [[ ! -r "${WORKING_RTF-}" ]]; then
    printf 'Cannot read the working rtf copy: "%s"\n' "${WORKING_RTF-}" >&2
    return 1
  fi
  if ! command -v msguniq > /dev/null; then
    printf 'msguniq (gettext) is required but was not found\n' >&2
    return 1
  fi

  {
    print_po_header
    rtf_rows_to_po < "${WORKING_RTF}"
  } > "${TEMP_PO}" || return 1

  # Unify duplicates due to pot syntax
  msguniq --no-wrap "${TEMP_PO}" > "${PO}"
}

convert_working_rtf