Scripting tools to interact with Thea 2 The Shattering files in order to translate them easily.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

191 lines
7.4 KiB

  1. #!/usr/bin/env bash
  2. # This script export a multi-lanbguage RTF export from memoQ to a gettext .po
  3. if [[ "$#" -ne 1 ]]; then
  4. echo "Please pass exactly 1 parameter: the rtf to convert"
  5. exit 1
  6. fi
  7. ORIGINAL_RTF="$1"
  8. WORKING_RTF="$1.work"
  9. PO_DIR="$(pwd)/"
  10. TEMP_DIR="$(mktemp -d)"
  11. PO=$(basename "${ORIGINAL_RTF%%.txt_fre.rtf}.po")
  12. TEMP_PO="${TEMP_DIR}/${PO}"
  13. PO="${PO_DIR}/${PO}"
  14. cat "${ORIGINAL_RTF}" > "${WORKING_RTF}"
  15. dos2unix "${WORKING_RTF}" &> /dev/null
  16. #Remove all unused lines
  17. sed -i '/brdrcf17/d' "${WORKING_RTF}"
  18. sed -i '/ltrpar/d' "${WORKING_RTF}"
  19. sed -i '/fs16/d' "${WORKING_RTF}"
  20. sed -i '/\\row/d' "${WORKING_RTF}"
  21. sed -i 's/\\\\//g' "${WORKING_RTF}"
  22. sed -i 's/{\\rtlch\\fcs1\ \\ltrch\\fcs0//g' "${WORKING_RTF}"
  23. sed -i 's/\\noproof \\cell }//g' "${WORKING_RTF}"
  24. sed -i 's/\\cell }//g' "${WORKING_RTF}"
  25. sed -i 's/{\\plain\\noproof\ \\cs99\\f0\\fs20\\cf13\ \\{MQ\\}}//g' "${WORKING_RTF}"
  26. #Remove useless translations of EVENT, NODE, STORY
  27. sed -i '/\lang1036 -- \[EVENT\]/d' "${WORKING_RTF}"
  28. sed -i '/\lang1036 +\[NODE\]/d' "${WORKING_RTF}"
  29. sed -i '/\lang1036 \[STORY\]/d' "${WORKING_RTF}"
  30. sed -i '/\lang1036 \[\/EVENT\]/d' "${WORKING_RTF}"
  31. sed -i '/\lang1036 \[\/NODE\]/d' "${WORKING_RTF}"
  32. sed -i '/\lang1036 \[\/STORY\]/d' "${WORKING_RTF}"
  33. # Remove headers
  34. sed -i '1,23d' "${WORKING_RTF}"
  35. # Quick function to deal with \uc0\uXXX char in rtf format, as iconv and other tool cannot detect them...
  36. # In addition to that, rtf adds a useless space after each encoded char that need to be corrected manually
  37. function convert_char_to_utf8 {
  38. # Convert all unbreakable spaces in the line
  39. temp_line="${line//'\~'/' '}"
  40. i=$(echo "${temp_line}" | grep "uc0")
  41. while [[ "$i" != "" ]]
  42. do
  43. # Extract the utf16 code eg: \uc0\u171 -> 171
  44. temp=$(echo "${temp_line#*'\uc0\u'}")
  45. code=$(echo "${temp%%' '*}")
  46. # Convert the code to hexa eg: 171 -> 0x00ab
  47. char=$(printf "0x%04x" ${code})
  48. # Convert to unicode signal (first 0x should be \u) eg: 0x00ab -> \u00ab (substitution) -> « (echo -ne)
  49. # |tr -d '\0' remove a warning introduced in bash 4.4, see https://lists.gnu.org/archive/html/bug-bash/2016-09/msg00015.html
  50. char=$(echo -ne "${char/0x/'\u'}" |tr -d '\0')
  51. # Insert the new char in the line
  52. temp_line=$(echo "${temp_line}" | sed 's/\\uc0\\u'"$code"' /'"$char"'/g')
  53. # In case the char is end of line, insert with no space
  54. temp_line=$(echo "${temp_line}" | sed 's/\\uc0\\u'"$code"'/'"$char"'/g')
  55. i=$(echo "${temp_line}" | grep "uc0")
  56. done
  57. echo ${temp_line}
  58. }
  59. comment_event=""
  60. comment_node=""
  61. comment_story=""
  62. comment_out=""
  63. msgstr=""
  64. msgid=""
  65. out_incr=1
  66. story_first_line=0
  67. while read -r line; do
  68. if [[ ${line} == *"-- [EVENT]"* ]]; then
  69. comment_event="${line}"
  70. elif [[ ${line} == *"+[NODE]"* ]]; then
  71. comment_node="${line}"
  72. elif [[ ${line} == *"[STORY]"* ]]; then
  73. comment_story="${comment_event}@@${comment_node}@@${line}"
  74. # Clean strings
  75. comment_story="${comment_story//'\noproof '/}"
  76. comment_story="${comment_story//'\cell }'/}"
  77. # Time for new story, write the comment to pot
  78. echo "#. ${comment_story}" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  79. echo >> "${TEMP_PO}"
  80. story_first_line=1
  81. elif [[ ${line} == *"[/STORY]"* ]]; then
  82. # we reach end of story, insert the translated msgstr and reinit
  83. msgstr="${msgstr//'\lang1036 '/}"
  84. msgstr="${msgstr//'\cell }'/}"
  85. echo "msgstr \"\"${msgstr}" >> "${TEMP_PO}"
  86. echo >> "${TEMP_PO}"
  87. comment_story=""
  88. msgstr=""
  89. elif [[ ${line} == *"\noproof [OUT]"* ]]; then
  90. # It's an out for english part (with noproof) so write the comment_out
  91. comment_out="${comment_event}@@${comment_node}@@[OUT]${out_incr}"
  92. comment_out="${comment_out//'\noproof '/}"
  93. comment_out="${comment_out//'\cell }'/}"
  94. # OUT are one line, let's write it to the po
  95. echo "#. ${comment_out}" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  96. echo >> "${TEMP_PO}"
  97. line="${line//'\cell }'/}"
  98. line="${line//'\noproof '/}"
  99. # Get rid of special character
  100. line="${line//'"'/'\"'}"
  101. echo "msgid \"${line#\[OUT\]}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  102. echo >> "${TEMP_PO}"
  103. ((out_incr++))
  104. elif [[ ${line} == *"\lang1036 [OUT]"* ]]; then
  105. # It's an out for translated part (with lang1036) so write the msgstr
  106. line="${line//'\cell }'/}"
  107. line="${line//'\lang1036 '/}"
  108. # Get rid of special character
  109. line="${line//'"'/'\"'}"
  110. encoded_line=$(convert_char_to_utf8 "${line}")
  111. echo "msgstr \"${encoded_line#\[OUT\]}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  112. echo >> "${TEMP_PO}"
  113. elif [[ ${line} == *"[/NODE]"* ]]; then
  114. # We reach end of node, reinit
  115. comment_node=""
  116. comment_story=""
  117. comment_out=""
  118. out_incr=1
  119. elif [[ ${line} == *"[/EVENT]"* ]]; then
  120. # We reach end of event, reinit
  121. comment_event=""
  122. comment_node=""
  123. comment_story=""
  124. comment_out=""
  125. out_incr=1
  126. elif [[ "${comment_story}" != "" && ${story_first_line} == 1 ]]; then
  127. # As developer sometimes forgot empty lines at the end of STORY, skip empty lines
  128. if [[ "${line}" = "" ]]; then
  129. continue
  130. fi
  131. # If the line has nothing particular, and the comment_story is not empty, it's a story string
  132. # As the story_fist_line is set, it's the first line of the story, insert the msgid
  133. line="${line//'\cell }'/}"
  134. # Get rid of special character
  135. line="${line//'"'/'\"'}"
  136. echo "${line//'\noproof '/'msgid "'}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  137. echo >> "${TEMP_PO}"
  138. story_first_line=0
  139. elif [[ ${comment_story} != "" ]]; then
  140. # As developer sometimes forgot empty lines at the end of STORY, skip empty lines
  141. if [[ "${line}" = "" ]]; then
  142. continue
  143. fi
  144. # If it contains lang1036 it's a translated line, keep it for later
  145. if [[ $(echo "${line}" | grep "lang1036") ]]; then
  146. # Get rid of special character
  147. line="${line//'"'/'\"'}"
  148. # We need a newline between each translated lines
  149. encoded_line=$(convert_char_to_utf8 "${line}")
  150. msgstr="${msgstr}
  151. \"${encoded_line}\n\""
  152. continue
  153. fi
  154. # Also as this is multiline, we need to add \n to the previous line (no way to detect it sooner...)
  155. sed -e '$s/\(.*\)"$/\1\\n"/' -i "${TEMP_PO}"
  156. line="${line//'\cell }'/}"
  157. line="${line//'\noproof '/}"
  158. # Get rid of special character
  159. line="${line//'"'/'\"'}"
  160. echo "\"${line}\"" | tr -d '\r' | tr -d '\n' >> "${TEMP_PO}"
  161. echo >> "${TEMP_PO}"
  162. else
  163. # It should be a newline, write a newline
  164. # If new case are added later, we need to deal with it manually, do not break the file by writing them
  165. echo "" >> "${TEMP_PO}"
  166. fi
  167. done < "${WORKING_RTF}"
  168. # Insert default po header
  169. echo 'msgid ""
  170. msgstr ""
  171. "MIME-Version: 1.0\n"
  172. "Content-Transfer-Encoding: 8bit\n"
  173. "Content-Type: text/plain; charset=UTF-8\n"
  174. "Project-Id-Version: \n"
  175. "POT-Creation-Date: \n"
  176. "PO-Revision-Date: \n"
  177. "Last-Translator: \n"
  178. "Language-Team: \n"' | cat - "${TEMP_PO}" > "${TEMP_PO}.tmp" && mv "${TEMP_PO}.tmp" "${TEMP_PO}"
  179. # Unify duplicate due to pot syntax
  180. msguniq --no-wrap "${TEMP_PO}" > "${PO}"