Scripting tools to interact with Thea 2 The Shattering files in order to translate them easily.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

202 lines
8.0 KiB

  1. #!/usr/bin/env bash
  2. # This script export a multi-lanbguage RTF export from memoQ to a gettext .po
  3. if [[ "$#" -ne 1 ]]; then
  4. echo "Please pass exactly 1 parameter: the rtf to convert"
  5. exit 1
  6. fi
  7. ORIGINAL_RTF="$1"
  8. WORKING_RTF="$1.work"
  9. PO_DIR="$(pwd)/"
  10. TEMP_DIR="$(mktemp -d)"
  11. PO=$(basename "${ORIGINAL_RTF%%.txt_fre.rtf}.po")
  12. TEMP_PO="${TEMP_DIR}/${PO}"
  13. PO="${PO_DIR}/${PO}"
  14. cat "${ORIGINAL_RTF}" > "${WORKING_RTF}"
  15. dos2unix "${WORKING_RTF}" &> /dev/null
  16. #Remove all unused lines
  17. sed -i '/brdrcf17/d' "${WORKING_RTF}"
  18. sed -i '/ltrpar/d' "${WORKING_RTF}"
  19. sed -i '/fs16/d' "${WORKING_RTF}"
  20. sed -i '/\\row/d' "${WORKING_RTF}"
  21. sed -i 's/\\\\//g' "${WORKING_RTF}"
  22. sed -i 's/{\\rtlch\\fcs1\ \\ltrch\\fcs0//g' "${WORKING_RTF}"
  23. sed -i 's/\\noproof \\cell }//g' "${WORKING_RTF}"
  24. sed -i 's/\\cell }//g' "${WORKING_RTF}"
  25. sed -i 's/{\\plain\\noproof\ \\cs99\\f0\\fs20\\cf13\ \\{MQ\\}}//g' "${WORKING_RTF}"
  26. #Remove useless translations of EVENT, NODE, STORY
  27. sed -i '/\lang1036 -- \[EVENT\]/d' "${WORKING_RTF}"
  28. sed -i '/\lang1036 +\[NODE\]/d' "${WORKING_RTF}"
  29. sed -i '/\lang1036 \[STORY\]/d' "${WORKING_RTF}"
  30. sed -i '/\lang1036 \[\/EVENT\]/d' "${WORKING_RTF}"
  31. sed -i '/\lang1036 \[\/NODE\]/d' "${WORKING_RTF}"
  32. sed -i '/\lang1036 \[\/STORY\]/d' "${WORKING_RTF}"
  33. # Remove headers
  34. sed -i '1,23d' "${WORKING_RTF}"
  35. # Quick function to deal with \uc0\uXXX char in rtf format, as iconv and other tool cannot detect them...
  36. # In addition to that, rtf adds a useless space after each encoded char that need to be corrected manually
  37. function convert_char_to_utf8 {
  38. # Convert all unbreakable spaces in the line
  39. temp_line="${line//'\~'/' '}"
  40. i=$(echo "${temp_line}" | grep "uc0")
  41. while [[ "$i" != "" ]]
  42. do
  43. # Extract the utf16 code eg: \uc0\u171 -> 171
  44. temp=$(echo "${temp_line#*'\uc0\u'}")
  45. code=$(echo "${temp%%' '*}")
  46. # Convert the code to hexa eg: 171 -> 0x00ab
  47. char=$(printf "0x%04x" ${code})
  48. # Convert to unicode signal (first 0x should be \u) eg: 0x00ab -> \u00ab (substitution) -> « (echo -ne)
  49. # |tr -d '\0' remove a warning introduced in bash 4.4, see https://lists.gnu.org/archive/html/bug-bash/2016-09/msg00015.html
  50. char=$(echo -ne "${char/0x/'\u'}" |tr -d '\0')
  51. # Insert the new char in the line
  52. temp_line=$(echo "${temp_line}" | sed 's/\\uc0\\u'"$code"' /'"$char"'/g')
  53. # In case the char is end of line, insert with no space
  54. temp_line=$(echo "${temp_line}" | sed 's/\\uc0\\u'"$code"'/'"$char"'/g')
  55. i=$(echo "${temp_line}" | grep "uc0")
  56. done
  57. echo ${temp_line}
  58. }
  59. comment_event=""
  60. comment_node=""
  61. comment_story=""
  62. comment_out=""
  63. msgstr=""
  64. msgid=""
  65. out_incr=1
  66. story_first_line=0
  67. while read -r line; do
  68. if [[ ${line} == *"-- [EVENT]"* ]]; then
  69. comment_event="${line}"
  70. elif [[ ${line} == *"+[NODE]"* ]]; then
  71. comment_node="${line}"
  72. elif [[ ${line} == *"[STORY]"* ]]; then
  73. comment_story="${comment_event}@@${comment_node}@@${line}"
  74. # Clean strings
  75. comment_story="${comment_story//'\noproof '/}"
  76. comment_story="${comment_story//'\cell }'/}"
  77. # Time for new story, write the comment to pot
  78. echo "#. ${comment_story}" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  79. echo >> "${TEMP_PO}"
  80. story_first_line=1
  81. elif [[ ${line} == *"[/STORY]"* ]]; then
  82. # we reach end of story, insert the translated msgstr and reinit
  83. msgstr="${msgstr//'\lang1036 '/}"
  84. msgstr="${msgstr//'\cell }'/}"
  85. # If there is still lang1036 then it means the msgstr is untranslated, insert an empty translation
  86. if [[ ${msgstr} == *"lang1036"* ]]; then
  87. echo "msgstr \"\"" >> "${TEMP_PO}"
  88. echo >> "${TEMP_PO}"
  89. else
  90. echo "msgstr \"\"${msgstr}" >> "${TEMP_PO}"
  91. echo >> "${TEMP_PO}"
  92. fi
  93. comment_story=""
  94. msgstr=""
  95. elif [[ ${line} == *"\noproof [OUT]"* ]]; then
  96. # It's an out for english part (with noproof) so write the comment_out
  97. comment_out="${comment_event}@@${comment_node}@@[OUT]${out_incr}"
  98. comment_out="${comment_out//'\noproof '/}"
  99. comment_out="${comment_out//'\cell }'/}"
  100. # OUT are one line, let's write it to the po
  101. echo "#. ${comment_out}" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  102. echo >> "${TEMP_PO}"
  103. line="${line//'\cell }'/}"
  104. line="${line//'\noproof '/}"
  105. # Get rid of special character
  106. line="${line//'"'/'\"'}"
  107. echo "msgid \"${line#\[OUT\]}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  108. echo >> "${TEMP_PO}"
  109. # Insert an empty msgstr in case of untranslated [OUT]
  110. # (in this case it never matches the \lang1036 [OUT] below so we need to insert it outside)
  111. echo "msgstr \"\"" >> "${TEMP_PO}"
  112. echo >> "${TEMP_PO}"
  113. ((out_incr++))
  114. elif [[ ${line} == *"\lang1036 [OUT]"* ]]; then
  115. # It's an out for translated part (with lang1036) so write the msgstr.
  116. line="${line//'\cell }'/}"
  117. line="${line//'\lang1036 '/}"
  118. # Get rid of special character
  119. line="${line//'"'/'\"'}"
  120. encoded_line=$(convert_char_to_utf8 "${line}")
  121. # Insert without the msgstr as it has been inserted with the msgid before.
  122. echo "\"${encoded_line#\[OUT\]}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  123. echo >> "${TEMP_PO}"
  124. elif [[ ${line} == *"[/NODE]"* ]]; then
  125. # We reach end of node, reinit
  126. comment_node=""
  127. comment_story=""
  128. comment_out=""
  129. out_incr=1
  130. elif [[ ${line} == *"[/EVENT]"* ]]; then
  131. # We reach end of event, reinit
  132. comment_event=""
  133. comment_node=""
  134. comment_story=""
  135. comment_out=""
  136. out_incr=1
  137. elif [[ "${comment_story}" != "" && ${story_first_line} == 1 ]]; then
  138. # As developer sometimes forgot empty lines at the end of STORY, skip empty lines
  139. if [[ "${line}" = "" ]]; then
  140. continue
  141. fi
  142. # If the line has nothing particular, and the comment_story is not empty, it's a story string
  143. # As the story_fist_line is set, it's the first line of the story, insert the msgid
  144. line="${line//'\cell }'/}"
  145. # Get rid of special character
  146. line="${line//'"'/'\"'}"
  147. echo "${line//'\noproof '/'msgid "'}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  148. echo >> "${TEMP_PO}"
  149. story_first_line=0
  150. elif [[ ${comment_story} != "" ]]; then
  151. # As developer sometimes forgot empty lines at the end of STORY, skip empty lines
  152. if [[ "${line}" = "" ]]; then
  153. continue
  154. fi
  155. # If it contains lang1036 it's a translated line, keep it for later
  156. if [[ $(echo "${line}" | grep "lang1036") ]]; then
  157. # Get rid of special character
  158. line="${line//'"'/'\"'}"
  159. encoded_line=$(convert_char_to_utf8 "${line}")
  160. # We need a newline between each translated lines
  161. msgstr="${msgstr}
  162. \"${encoded_line}\n\""
  163. continue
  164. fi
  165. # Also as this is multiline, we need to add \n to the previous line (no way to detect it sooner...)
  166. sed -e '$s/\(.*\)"$/\1\\n"/' -i "${TEMP_PO}"
  167. line="${line//'\cell }'/}"
  168. line="${line//'\noproof '/}"
  169. # Get rid of special character
  170. line="${line//'"'/'\"'}"
  171. echo "\"${line}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  172. echo >> "${TEMP_PO}"
  173. else
  174. # It should be a newline, write a newline
  175. # If new case are added later, we need to deal with it manually, do not break the file by writing them
  176. echo "" >> "${TEMP_PO}"
  177. fi
  178. done < "${WORKING_RTF}"
  179. # Insert default po header
  180. echo 'msgid ""
  181. msgstr ""
  182. "MIME-Version: 1.0\n"
  183. "Content-Transfer-Encoding: 8bit\n"
  184. "Content-Type: text/plain; charset=UTF-8\n"
  185. "Project-Id-Version: \n"
  186. "POT-Creation-Date: \n"
  187. "PO-Revision-Date: \n"
  188. "Last-Translator: \n"
  189. "Language-Team: \n"' | cat - "${TEMP_PO}" > "${TEMP_PO}.tmp" && mv "${TEMP_PO}.tmp" "${TEMP_PO}"
  190. # Unify duplicate due to pot syntax
  191. msguniq --no-wrap "${TEMP_PO}" > "${PO}"