#!/usr/bin/env bash
## RSS to PDF
## This is a tool which accepts a single RSS feed, and produces a number
## of article PDFs in the format [sectiontitle-hash.pdf]. Naming it this
## way has the added benefit of random sorting within each section.
## Usage:
##   cat "input.rss" | rsstopdf "output.pdf"

# Dump xml from stdin
rawxmlfile="$(mktemp)"
tmpdir="$(mktemp -d)"
cat > "$rawxmlfile"

# Directory holding this script; the .tex template is read from here.
scriptdir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"

# Escapes plain text (stdin -> stdout) so that LaTeX typesets it literally.
#
# Backslash, tilde and caret expand to macros that themselves contain
# backslashes and braces, so they are first swapped for control-character
# placeholders and only expanded after the single-character escapes have
# run. Escaping them in place would re-escape the macros just inserted.
# Input is assumed not to contain the bytes \001-\003.
function tex_escape {
  local bs=$'\001' tilde=$'\002' caret=$'\003'
  sed \
    -e "s|’|'|g" \
    -e "s|\\\\|${bs}|g" \
    -e "s|~|${tilde}|g" \
    -e "s|\\^|${caret}|g" \
    -e 's|[{}#$%&_]|\\&|g' \
    -e "s|${bs}|\\\\textbackslash{}|g" \
    -e "s|${tilde}|\\\\textasciitilde{}|g" \
    -e "s|${caret}|\\\\textasciicircum{}|g"
}

# Converts HTML (stdin) to LaTeX (stdout) with pandoc, then drops the first
# 49 and the last 2 lines of the standalone document and shortens pandoc's
# 3in rules to 1in.
# NOTE(review): the 49-line offset presumably matches the preamble emitted by
# the pandoc version this was written against -- confirm when upgrading.
function tex_escape2 {
  pandoc --standalone -f html -t latex \
    | tail -n +50 \
    | head -n -2 \
    | sed 's|\\rule{3in}|\\rule{1in}|g'
}

# Escapes $1 for use as the replacement half of a sed `s|...|...|` command:
# backslash, `&` and the `|` delimiter are backslash-quoted, and embedded
# newlines are turned into backslash-newline.
function sed_replacement_escape {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g' -e '$!s/$/\\/'
}

# Need to know date, category, title, author, and body
suffixes=("th" "st" "nd" "rd" "th" "th" "th" "th" "th" "th")
date="$(date '+%A, %B %d, %Y' | sed 's/ 0/ /g')"
daynumber="$(awk -F' ' '{print $3}' <<< "$date" | tr -d ',')"
dayindex=$(( daynumber % 10 ))
# 11, 12 and 13 take "th" even though their last digit is 1, 2 or 3.
if (( daynumber % 100 >= 11 && daynumber % 100 <= 13 )); then
  daysuffix="th"
else
  daysuffix="${suffixes[$dayindex]}"
fi

# Tuesday, October 2nd, 2018
datestring="$(awk -F ',' -v suffix="$daysuffix" \
  '{print $1 "," $2 suffix "," $3}' <<< "$date")"

title_delim="_" # Because we use `cut`, keep this a single character

# Poor man's hash-map
titles=()
title_hashes=()

# Prints the title recorded for the md5 hash given as $1; prints nothing when
# the hash is unknown. Uses `return` and a local index so it is safe to call
# both inside and outside a command substitution.
function get_title_from_hash {
  local idx
  for ((idx = 0; idx < ${#titles[@]}; idx++)); do
    if [[ "${title_hashes[$idx]}" == "$1" ]]; then
      printf '%s\n' "${titles[$idx]}"
      return 0
    fi
  done
}

numberofitems="$(xpath "$rawxmlfile" 'count(/rss/channel/item)')"

for ((i=1; i<=$numberofitems; i++)); do
  #echo $i
  node="/rss/channel/item[$i]"

  # Collect
  category="$(xpath "$node/category/text()" < "$rawxmlfile" \
    | tex_escape)"
  real_title="$(xpath "$node/title/text()" < "$rawxmlfile")"
  title="$(printf '%s\n' "$real_title" \
    | tex_escape)"
  # Keep only the text inside the trailing parentheses of the author field.
  author="$(xpath "$node/author/text()" < "$rawxmlfile" \
    | sed 's/.*(\(.*\))/\1/' \
    | tex_escape2)"
  org="NPR"
  body="$(xpath "$node/content:encoded/text()" < "$rawxmlfile" \
    | base64 -d \
    | tex_escape2)"

  # Create a poor-man's hashmap of the titles. Later in the process,
  # we name the PDF files after the title hashes, but we must re-
  # connect them. This device holds that association.
  titles+=( "$real_title" )
  hash="$(printf '%s\n' "$real_title" | md5sum | cut -d ' ' -f 1)"
  title_hashes+=( "$hash" )

  # Replace the last word of the body with an \hbox and \textbullet
  # ...

  # Mix with tex template. Each value is escaped for sed first; otherwise sed
  # would eat the backslashes that tex_escape added (`\&` -> `&`) and choke on
  # values containing `|` or newlines.
  texout="$tmpdir/${category}${title_delim}${hash}.tex"
  sed \
    -e "s|~~category~~|$(sed_replacement_escape "$category")|g" \
    -e "s|~~title~~|$(sed_replacement_escape "$title")|g" \
    -e "s|~~date~~|$(sed_replacement_escape "$datestring")|g" \
    -e "s|~~author~~|$(sed_replacement_escape "$author")|g" \
    -e "s|~~org~~|$(sed_replacement_escape "$org")|g" \
    "$scriptdir/standard.tex.template" \
    > "$texout"

  # Put the body in the middle. Because of the variance of characters,
  # this was not done using `sed`.
  printf '%s\n' "$body" >> "$texout"

  # Put in a small footer
  # FIXME(review): the source text is damaged at this point. Everything
  # between `cat >> "$texout" <` and ` "$alltex"` is missing -- presumably the
  # footer here-document, the rest of this loop, and the code that defines
  # $alltex, $latex_pdf_includes and $latex_bookmarks. Restore it from the
  # original script. The surviving fragment read:
  #   cat >> "$texout" < "$alltex"
done

# Fail with a clear message instead of redirecting to an empty path while the
# section above is still missing.
: "${alltex:?alltex is not set: restore the missing section of this script}"

echo "$latex_pdf_includes" >> "$alltex"
echo "$latex_bookmarks" >> "$alltex"
printf '%s\n' '\end{document}' >> "$alltex"

# Process tex file into PDF; stick into output dir
(cd "$tmpdir" && pdflatex \
  -halt-on-error \
  -output-directory "$tmpdir" \
  "$alltex")

#rm -rf "$tmpdir" "$rawxmlfile"
echo "$tmpdir/all.pdf"

# Collect all the PDFs and organize them by category, then create the final
# PDF

# Echo the location of the final pdf