|
|
@ -0,0 +1,192 @@ |
|
|
|
#!/usr/bin/env bash |
|
|
|
|
|
|
|
# This script converts a multi-language RTF export from memoQ into a gettext .po file
|
|
|
|
|
|
|
# Require exactly one argument: the RTF file exported from memoQ.
# Diagnostics go to stderr so they never get mixed with regular output.
if (( $# != 1 )); then
  echo "Please pass exactly 1 parameter: the rtf to convert" >&2
  exit 1
fi

# Stop early on an unreadable input instead of generating a bogus .po from
# an empty working copy.
if [[ ! -r "$1" ]]; then
  echo "Cannot read the rtf to convert: $1" >&2
  exit 1
fi
|
|
|
|
|
|
|
# Paths used throughout the conversion:
#   ORIGINAL_RTF  input file given on the command line (never modified)
#   WORKING_RTF   scratch copy written next to the input
#   PO_DIR        output directory (the current working directory)
#   TEMP_DIR      private scratch directory, removed when the script exits
#   TEMP_PO       intermediate .po built inside TEMP_DIR
#   PO            final .po written into PO_DIR
ORIGINAL_RTF="$1"
WORKING_RTF="${ORIGINAL_RTF}.work"
PO_DIR="$(pwd)"

if ! TEMP_DIR="$(mktemp -d)"; then
  echo "Unable to create a temporary directory" >&2
  exit 1
fi
# The scratch directory only holds intermediate files, so drop it on every
# exit path. ":?" guards against ever running "rm -rf" on an empty path.
trap 'rm -rf -- "${TEMP_DIR:?}"' EXIT

# Output name: input base name with the memoQ ".txt_fre.rtf" suffix replaced
# by ".po" (any other name simply gets ".po" appended).
PO_NAME="$(basename -- "${ORIGINAL_RTF%%.txt_fre.rtf}.po")"
TEMP_PO="${TEMP_DIR}/${PO_NAME}"
PO="${PO_DIR}/${PO_NAME}"
|
|
|
|
|
|
|
# Work on a copy so the original export is never modified.
cat "${ORIGINAL_RTF}" > "${WORKING_RTF}"

# Normalise line endings. Best effort: output and errors are deliberately
# silenced, and the writers further down strip any remaining CR themselves.
dos2unix "${WORKING_RTF}" &> /dev/null

# Clean the RTF in a single pass. Every expression below only looks at the
# current line and they run in the listed order, so this is equivalent to
# running them as separate "sed -i" invocations without rewriting the file
# once per expression.
#   1. drop lines that carry no text (table borders, paragraph and font
#      set-up, row ends)
#   2. strip escaped backslashes and the RTF group wrappers around the text
#   3. drop the translated (\lang1036) copies of the EVENT/NODE/STORY
#      markers; the backslash is matched explicitly ("\\lang1036") rather
#      than through the undefined "\l" regex escape
sed -i \
  -e '/brdrcf17/d' \
  -e '/ltrpar/d' \
  -e '/fs16/d' \
  -e '/\\row/d' \
  -e 's/\\\\//g' \
  -e 's/{\\rtlch\\fcs1\ \\ltrch\\fcs0//g' \
  -e 's/\\noproof \\cell }//g' \
  -e 's/\\cell }//g' \
  -e 's/{\\plain\\noproof\ \\cs99\\f0\\fs20\\cf13\ \\{MQ\\}}//g' \
  -e '/\\lang1036 -- \[EVENT\]/d' \
  -e '/\\lang1036 +\[NODE\]/d' \
  -e '/\\lang1036 \[STORY\]/d' \
  -e '/\\lang1036 \[\/EVENT\]/d' \
  -e '/\\lang1036 \[\/NODE\]/d' \
  -e '/\\lang1036 \[\/STORY\]/d' \
  "${WORKING_RTF}"

# Remove headers. This must stay a separate pass: the 23 lines are counted
# in the file as it looks after the deletions above.
sed -i '1,23d' "${WORKING_RTF}"
|
|
|
|
|
|
|
#######################################
# Decode the RTF unicode escapes (\uc0\uNNN) found in one line of text and
# print the line as UTF-8. iconv and similar tools do not understand these
# escapes, so they are decoded by hand.
#
# - NNN is a decimal UTF-16 code unit. RTF writes values above 32767 as
#   negative numbers; surrogate pairs are recombined into one code point.
#   NUL, unpaired surrogates and out-of-range values produce no output.
# - The single space RTF puts after an escape is only a delimiter and is
#   removed together with the escape.
# - "\~" (RTF non-breaking space) is turned into a plain space.
# - The UTF-8 bytes are computed here, so the result does not depend on the
#   locale of the calling shell.
#
# Arguments: $1 - the line to decode (defaults to the global "line")
# Outputs:   the decoded line on stdout, whitespace preserved
#######################################
convert_char_to_utf8() {
  local rest="${1-${line-}}"
  local out="" match="" digits="" prefix="" esc="" char=""
  local -i cp=0 high=0
  local -r escape_re='\\uc0\\u(-?[0-9]+) ?'
  local -r nbsp='\~'

  while [[ "${rest}" =~ ${escape_re} ]]; do
    match="${BASH_REMATCH[0]}"
    digits="${BASH_REMATCH[1]}"

    # Split around the leftmost escape; plain text is copied through with
    # only the non-breaking spaces converted.
    prefix="${rest%%"${match}"*}"
    rest="${rest#*"${match}"}"
    out+="${prefix//"${nbsp}"/ }"

    # 10# keeps a leading zero from being read as octal.
    cp=$(( 10#${digits#-} ))
    if [[ "${digits}" == -* ]]; then
      cp=$(( 65536 - cp ))
    fi

    # Recombine a surrogate pair when the low half follows immediately.
    if (( high != 0 )); then
      if [[ -z "${prefix}" ]] && (( cp >= 0xDC00 && cp <= 0xDFFF )); then
        cp=$(( 0x10000 + ((high - 0xD800) << 10) + (cp - 0xDC00) ))
      fi
      high=0
    fi
    if (( cp >= 0xD800 && cp <= 0xDBFF )); then
      high=cp
      continue
    fi
    if (( cp <= 0 || cp > 0x10FFFF || (cp >= 0xDC00 && cp <= 0xDFFF) )); then
      continue
    fi

    # Build the UTF-8 byte sequence as \xHH escapes, then expand it.
    if (( cp < 0x80 )); then
      printf -v esc '\\x%02x' "${cp}"
    elif (( cp < 0x800 )); then
      printf -v esc '\\x%02x\\x%02x' \
        "$(( 0xC0 | (cp >> 6) ))" \
        "$(( 0x80 | (cp & 0x3F) ))"
    elif (( cp < 0x10000 )); then
      printf -v esc '\\x%02x\\x%02x\\x%02x' \
        "$(( 0xE0 | (cp >> 12) ))" \
        "$(( 0x80 | ((cp >> 6) & 0x3F) ))" \
        "$(( 0x80 | (cp & 0x3F) ))"
    else
      printf -v esc '\\x%02x\\x%02x\\x%02x\\x%02x' \
        "$(( 0xF0 | (cp >> 18) ))" \
        "$(( 0x80 | ((cp >> 12) & 0x3F) ))" \
        "$(( 0x80 | ((cp >> 6) & 0x3F) ))" \
        "$(( 0x80 | (cp & 0x3F) ))"
    fi
    printf -v char '%b' "${esc}"
    out+="${char}"
  done

  # printf with a quoted argument: no word splitting, no globbing.
  printf '%s\n' "${out}${rest//"${nbsp}"/ }"
}
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main conversion: walk the cleaned RTF line by line and emit .po entries
# into ${TEMP_PO}.
#
# Markers recognised in the input:
#   -- [EVENT] / [/EVENT]   opens / closes an event (context for comments)
#   +[NODE]    / [/NODE]    opens / closes a node   (context for comments)
#   [STORY]    / [/STORY]   multi-line text: "\noproof" lines form the
#                           msgid, "\lang1036" lines the msgstr
#   \noproof [OUT]          single-line source text      -> msgid
#   \lang1036 [OUT]         single-line translated text  -> msgstr
# ---------------------------------------------------------------------------

# RTF fragments stripped from the text, and the opening of a msgid line.
RTF_CELL_END='\cell }'
RTF_NOPROOF='\noproof '
RTF_LANG_FR='\lang1036 '
PO_MSGID_OPEN='msgid "'

# Append one logical line to the temporary .po, dropping stray CR/LF first.
append_po_line() {
  local text="${1//$'\r'/}"
  printf '%s\n' "${text//$'\n'/}" >> "${TEMP_PO}"
}

comment_event=""
comment_node=""
comment_story=""
comment_out=""
msgstr=""
msgid=""
out_incr=1
story_first_line=0

# "read" without IFS= trims surrounding blanks on purpose: the prefix
# removals below expect the text to start at column 0. The "|| [[ -n ... ]]"
# keeps a final line that has no trailing newline.
while read -r line || [[ -n "${line}" ]]; do
  case "${line}" in
    *'-- [EVENT]'*)
      comment_event="${line}"
      ;;

    *'+[NODE]'*)
      comment_node="${line}"
      ;;

    *'[STORY]'*)
      # Time for a new story: write its extracted comment to the po.
      comment_story="${comment_event}@@${comment_node}@@${line}"
      comment_story=${comment_story//"${RTF_NOPROOF}"/}
      comment_story=${comment_story//"${RTF_CELL_END}"/}
      append_po_line "#. ${comment_story}"
      story_first_line=1
      ;;

    *'[/STORY]'*)
      # End of story: flush the translated lines collected so far as a
      # multi-line msgstr, followed by a blank separator line.
      msgstr=${msgstr//"${RTF_LANG_FR}"/}
      msgstr=${msgstr//"${RTF_CELL_END}"/}
      printf '%s\n\n' "msgstr \"\"${msgstr}" >> "${TEMP_PO}"
      comment_story=""
      msgstr=""
      ;;

    *'\noproof [OUT]'*)
      # Source side of an OUT (marked \noproof): comment plus msgid.
      comment_out="${comment_event}@@${comment_node}@@[OUT]${out_incr}"
      comment_out=${comment_out//"${RTF_NOPROOF}"/}
      comment_out=${comment_out//"${RTF_CELL_END}"/}
      append_po_line "#. ${comment_out}"

      line=${line//"${RTF_CELL_END}"/}
      line=${line//"${RTF_NOPROOF}"/}
      # Escape double quotes for the .po string.
      line="${line//\"/\\\"}"
      append_po_line "msgid \"${line#\[OUT\]}\""
      out_incr=$(( out_incr + 1 ))
      ;;

    *'\lang1036 [OUT]'*)
      # Translated side of an OUT (marked \lang1036): the msgstr.
      line=${line//"${RTF_CELL_END}"/}
      line=${line//"${RTF_LANG_FR}"/}
      line="${line//\"/\\\"}"
      encoded_line="$(convert_char_to_utf8 "${line}")"
      append_po_line "msgstr \"${encoded_line#\[OUT\]}\""
      ;;

    *'[/NODE]'*)
      # End of node: reset the node-level state.
      comment_node=""
      comment_story=""
      comment_out=""
      out_incr=1
      ;;

    *'[/EVENT]'*)
      # End of event: reset everything.
      comment_event=""
      comment_node=""
      comment_story=""
      comment_out=""
      out_incr=1
      ;;

    *)
      if [[ -z "${comment_story}" ]]; then
        # Outside of a story this should be an empty line: write a newline.
        # Unknown content is never copied, so new cases added later cannot
        # break the file; they have to be handled explicitly.
        printf '\n' >> "${TEMP_PO}"
      elif [[ -z "${line}" ]]; then
        # Developers sometimes leave empty lines at the end of a STORY.
        continue
      elif (( story_first_line == 1 )); then
        # First text line of the story: it opens the msgid.
        line=${line//"${RTF_CELL_END}"/}
        line="${line//\"/\\\"}"
        append_po_line "${line//"${RTF_NOPROOF}"/${PO_MSGID_OPEN}}\""
        story_first_line=0
      elif [[ "${line}" == *lang1036* ]]; then
        # Translated line: keep it for the msgstr written at [/STORY],
        # one quoted "...\n" string per line.
        line="${line//\"/\\\"}"
        encoded_line="$(convert_char_to_utf8 "${line}")"
        msgstr+=$'\n'"\"${encoded_line}\\n\""
      else
        # Further source line of a multi-line msgid. Only now is it known
        # that the previous line was not the last one, so add the missing
        # \n to it before appending this line.
        sed -i -e '$s/\(.*\)"$/\1\\n"/' "${TEMP_PO}"
        line=${line//"${RTF_CELL_END}"/}
        line=${line//"${RTF_NOPROOF}"/}
        line="${line//\"/\\\"}"
        append_po_line "\"${line}\""
      fi
      ;;
  esac
done < "${WORKING_RTF}"
|
|
|
|
|
|
|
# Put the standard gettext header entry in front of the generated entries.
# The quoted here-document delimiter keeps every "\n" literal.
{
  cat <<'PO_HEADER'
msgid ""
msgstr ""
"MIME-Version: 1.0\n"
"Content-Transfer-Encoding: 8bit\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Project-Id-Version: \n"
"POT-Creation-Date: \n"
"PO-Revision-Date: \n"
"Last-Translator: \n"
"Language-Team: \n"
PO_HEADER
  cat "${TEMP_PO}"
} > "${TEMP_PO}.tmp" && mv "${TEMP_PO}.tmp" "${TEMP_PO}"

# The same msgid can occur several times in the export, which the po syntax
# does not allow: merge the duplicates while writing the final file.
msguniq --no-wrap "${TEMP_PO}" > "${PO}"