#!/usr/bin/env bash

## RSS to PDF
## This is a tool which accepts a single RSS feed, and produces a number
## of article PDFs named [category_hash.pdf], which are then merged into
## one bookmarked PDF. Naming the articles this way has the added benefit
## of random sorting within each section.

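## Usage:
##   cat "input.rss" | rsstopdf "output.pdf"
## NOTE: the "output.pdf" argument is not read by the script yet; the path
## of the generated PDF is printed on stdout instead.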

# Dump xml from stdin into a scratch file so it can be queried more than
# once. Every later step writes into these scratch locations, so give up
# immediately if they cannot be created or the feed cannot be stored.
rawxmlfile="$(mktemp)" || {
    echo "rsstopdf: unable to create a temporary file" >&2
    exit 1
}
tmpdir="$(mktemp -d)" || {
    echo "rsstopdf: unable to create a temporary directory" >&2
    exit 1
}
cat > "$rawxmlfile" || {
    echo "rsstopdf: unable to store the feed read from stdin" >&2
    exit 1
}

# Absolute path of the directory holding this script; the .tex template
# files are looked up relative to it.
scriptdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
# Escape LaTeX special characters in plain text.
# Input:   text on stdin
# Output:  the same text on stdout with  \ { } # $ % & _ ~ ^  rewritten
#          so that LaTeX typesets them literally; the HTML entity
#          &#8217; (right single quote) becomes a plain apostrophe.
function tex_escape {
    # Stand-in for a literal backslash while the other characters are
    # escaped. Without it the backslashes added by the later rules (and
    # the braces of \textbackslash{}) would be escaped a second time.
    # A raw \001 byte in the input would therefore also come out as
    # \textbackslash{}.
    local backslash_mark=$'\001'

    sed -e "s|&#8217;|'|g" \
        -e "s|\\\\|${backslash_mark}|g" \
        -e 's|[{}#$%&_]|\\&|g' \
        -e 's|~|\\textasciitilde{}|g' \
        -e 's|\^|\\textasciicircum{}|g' \
        -e "s|${backslash_mark}|\\\\textbackslash{}|g"
}

# Convert HTML read from stdin into LaTeX written to stdout.
# pandoc renders a standalone LaTeX document; the first 49 lines and the
# last 2 lines of that document are thrown away and every \rule{3in} is
# narrowed to \rule{1in}.
# NOTE(review): the dropped lines are presumably pandoc's preamble and
# closing \end{document}; the fixed count of 49 likely depends on the
# installed pandoc version -- confirm before upgrading pandoc.
function tex_escape2 {
    local -a converter=(pandoc --standalone -f html -t latex)

    "${converter[@]}" \
        | sed -e '1,49d' -e 's|\\rule{3in}|\\rule{1in}|g' \
        | head -n -2
}

# Need to know date, category, title, author, and body

# Print the English ordinal suffix (st, nd, rd or th) for a day of the
# month.
# Arguments: $1 - day of the month; a leading zero ("02") is accepted
# Outputs:   the suffix on stdout
function ordinal_suffix {
    # Force base 10 so that "08" and "09" are not rejected as octal.
    local day=$((10#$1))

    # 11, 12 and 13 take "th" although they end in 1, 2 and 3.
    if (( day >= 11 && day <= 13 ))
    then
        echo "th"
        return
    fi

    case $((day % 10)) in
        1) echo "st" ;;
        2) echo "nd" ;;
        3) echo "rd" ;;
        *) echo "th" ;;
    esac
}

# Read the clock a single time so the day number and the printed date
# cannot disagree when the script runs across midnight.
date_raw="$(date '+%A, %B %d, %Y')"     # Tuesday, October 02, 2018
date_head="${date_raw%,*}"              # Tuesday, October 02
date_year="${date_raw##*, }"            # 2018
date_day="${date_head##* }"             # 02
date_day=$((10#$date_day))              # 2
date_head="${date_head% *}"             # Tuesday, October

# Tuesday, October 2, 2018
date="${date_head} ${date_day}, ${date_year}"
daysuffix="$(ordinal_suffix "$date_day")"
# Tuesday, October 2nd, 2018
datestring="${date_head} ${date_day}${daysuffix}, ${date_year}"

# Separator placed between the category and the hash in generated file
# names. Because we use `cut`, keep this a single character.
title_delim='_'

# Poor man's hash-map: two parallel arrays in which the entry at a given
# index of title_hashes is the hash of the title at the same index of
# titles.
declare -a titles=()
declare -a title_hashes=()

# Look up the title that was recorded for a hash.
# Globals:   titles, title_hashes (read) - parallel arrays
# Arguments: $1 - hash to look for
# Outputs:   the title of the first matching entry on stdout; nothing
#            when the hash is unknown
# Returns:   0 in both cases
function get_title_from_hash {
    local wanted="$1"
    local idx

    for ((idx=0; idx<${#titles[@]}; idx++))
    do
        if [[ "${title_hashes[$idx]}" == "$wanted" ]]
        then
            printf '%s\n' "${titles[$idx]}"
            # `return` rather than `exit`, so the function is also safe
            # to call outside of a $(...) subshell.
            return 0
        fi
    done

    return 0
}

# Escape a value so that sed inserts it verbatim when it is used as the
# replacement half of an `s|pattern|replacement|` command: backslash,
# `&` and the `|` delimiter are prefixed with a backslash, and embedded
# newlines are turned into backslash-newline.
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout
function sed_escape_replacement {
    printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g' -e '$!s/$/\\/'
}

# Number of <item> elements in the feed. Refuse to continue when the
# answer is not a plain number, because the loop below counts up to it.
numberofitems="$(xpath "$rawxmlfile" 'count(/rss/channel/item)')"
numberofitems="${numberofitems//[[:space:]]/}"
if ! [[ "$numberofitems" =~ ^[0-9]+$ ]]
then
    echo "rsstopdf: could not count the feed items (got '$numberofitems')" >&2
    exit 1
fi

# Render every feed item into its own PDF inside $tmpdir.
for ((i=1; i<=numberofitems; i++))
do
    node="/rss/channel/item[$i]"

    # Collect
    category="$(xpath "$node/category/text()" < "$rawxmlfile" \
        | tex_escape)"
    real_title="$(xpath "$node/title/text()" < "$rawxmlfile")"
    title="$(printf '%s\n' "$real_title" \
        | tex_escape)"
    # Keep only the text inside the trailing parentheses of the author
    # field, then let pandoc turn it into LaTeX.
    author="$(xpath "$node/author/text()" < "$rawxmlfile" \
        | sed 's/.*(\(.*\))/\1/' \
        | tex_escape2)"
    org="NPR"
    # The encoded content is base64-decoded before the HTML -> LaTeX
    # conversion.
    body="$(xpath "$node/content:encoded/text()" < "$rawxmlfile" \
        | base64 -d \
        | tex_escape2)"

    # Create a poor-man's hashmap of the titles. Later in the process,
    # we name the PDF files after the title hashes, but we must re-
    # connect them. This device holds that association.
    titles+=( "$real_title" )
    hash="$(printf '%s\n' "$real_title" | md5sum | cut -d ' ' -f 1)"
    title_hashes+=( "$hash" )

    # Replace the last word of the body with an \hbox and \textbullet
    # ...

    # Mix with tex template. The values are escaped for sed first;
    # otherwise sed would swallow the backslashes that the LaTeX
    # escaping produced and treat `&` as "the matched placeholder".
    texout="$tmpdir/${category}${title_delim}${hash}.tex"
    sed \
        -e "s|~~category~~|$(sed_escape_replacement "$category")|g" \
        -e "s|~~title~~|$(sed_escape_replacement "$title")|g" \
        -e "s|~~date~~|$(sed_escape_replacement "$datestring")|g" \
        -e "s|~~author~~|$(sed_escape_replacement "$author")|g" \
        -e "s|~~org~~|$(sed_escape_replacement "$org")|g" \
        "$scriptdir/standard.tex.template" \
        > "$texout"
    # Put the body in the middle. Because of the variance of characters,
    # this was not done using `sed`.
    printf '%s\n' "$body" >> "$texout"
    # Put in a small footer
    cat >> "$texout" <<'EOF'
\end{multicols*}
\newpage
\end{document}
EOF

    # Latex twice to resolve page numbers. A failed first pass cannot be
    # repaired by the second one, so report it and move on.
    for ((n=0; n<2; n++))
    do
        if ! pdflatex \
            -halt-on-error \
            -output-directory "$tmpdir" \
            "$texout"
        then
            echo "rsstopdf: pdflatex failed for $texout" >&2
            break
        fi
    done
done

# Create the bookmarks list--walk the article PDFs category by category
# and record each article's start page, so we can build a nested list.
# This step, unfortunately, kills any links we had in the original
# documents.

# Unique, sorted category names: the part of each PDF file name in front
# of the last $title_delim. The glob is used instead of parsing `ls` so
# that names containing spaces stay in one piece.
categories=()
readarray -t categories < <(
    for pdf_path in "$tmpdir"/*.pdf
    do
        # An unmatched glob stays as the literal pattern; skip it.
        [[ -e "$pdf_path" ]] || continue
        pdf_name="${pdf_path##*/}"
        printf '%s\n' "${pdf_name%"$title_delim"*}"
    done | sort -u)

latex_pdf_includes=""
page_count=1
latex_bookmarks=""

for category in "${categories[@]}"
do
    latex_bookmarks+="\\bookmark[page=$page_count,level=0]{$category}"$'\n'

    for article_path in "$tmpdir/${category}${title_delim}"*.pdf
    do
        [[ -e "$article_path" ]] || continue
        article_name="${article_path##*/}"
        # Ignore files of a longer category name that merely begins with
        # this one.
        [[ "${article_name%"$title_delim"*}" == "$category" ]] || continue

        # Get the title from our hashmap
        hash="${article_name##*"$title_delim"}"
        hash="${hash%.pdf}"
        title="$(get_title_from_hash "$hash")"

        # The page count is needed to place the next bookmark; without
        # it the article cannot be positioned, so leave it out.
        article_page_count="$(pdfinfo "$article_path" \
            | awk '/^Pages:/ {print $NF}')"
        if ! [[ "$article_page_count" =~ ^[0-9]+$ ]]
        then
            echo "rsstopdf: cannot read the page count of $article_path; skipping" >&2
            continue
        fi

        latex_pdf_includes+="\\pagenumbering{arabic}"$'\n'
        latex_pdf_includes+="\\setcounter{page}{1}"$'\n'
        latex_pdf_includes+="\\includepdf[pages=1-]{${category}${title_delim}${hash}.pdf}"$'\n'

        # NOTE(review): $title is the raw feed title, not LaTeX-escaped;
        # characters such as % or # may need escaping here -- confirm.
        latex_bookmarks+="\\bookmark[page=$page_count,level=1]{$title}"$'\n'

        page_count=$((page_count+article_page_count))
    done
done

# Bookmarks
# LaTeX is again used to generate PDF bookmarks. The driver document is
# the shared header, followed by the collected \includepdf lines, the
# collected \bookmark lines and the closing \end{document}.
alltex="$tmpdir/all.tex"
{
    cat "$scriptdir/bookmarks_header.tex"
    printf '%s\n' "$latex_pdf_includes"
    printf '%s\n' "$latex_bookmarks"
    printf '%s\n' '\end{document}'
} > "$alltex"


# Process tex file into PDF; stick into output dir. pdflatex runs from
# inside $tmpdir because the \includepdf entries are bare file names.
# Stop with an error instead of announcing a PDF that was never built.
if ! (cd "$tmpdir" && pdflatex \
    -halt-on-error \
    -output-directory "$tmpdir" \
    "$alltex")
then
    echo "rsstopdf: pdflatex failed to build $alltex" >&2
    exit 1
fi
# Cleanup stays disabled: the PDF announced below lives inside $tmpdir.
#rm -rf "$tmpdir" "$rawxmlfile"
echo "$tmpdir/all.pdf"
# Collect all the PDFs and organize them by category, then create the final
# PDF

# Echo the location of the final pdf