#!/usr/bin/env bash
## RSS to PDF
##
## Reads a single RSS feed on stdin, typesets every <item> with pdflatex
## using standard.tex.template (looked up next to this script), and merges
## the resulting article PDFs, grouped by category, into /tmp/test.pdf.
##
## Per-article files are written to a fresh temporary directory as
## "<category>|<title>.tex" / ".pdf". (The original design note called for
## "sectiontitle-hash.pdf" names to get random ordering within a section;
## that is not implemented.)
##
## Usage:
##   cat input.rss | rsstopdf
##
## Output: build chatter and the pdfmark bookmark entries on stdout; the
## last line printed is the temporary directory holding the article files.
## Requires: xpath (XML::XPath, "xpath QUERY < file" syntax), pdflatex,
## pdfinfo, pdfunite.

set -u

# Separates category and title in file names. safe_filename never emits
# this character, so it appears exactly once per file name.
readonly title_delim="|"
readonly final_pdf="/tmp/test.pdf"

# Print an error to stderr and abort.
die() {
  printf 'rsstopdf: %s\n' "$*" >&2
  exit 1
}

# Print a warning to stderr and carry on.
warn() {
  printf 'rsstopdf: warning: %s\n' "$*" >&2
}

for tool in xpath pdflatex pdfinfo pdfunite; do
  command -v "$tool" > /dev/null || die "required tool not found: $tool"
done

scriptdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)"
template="$scriptdir/standard.tex.template"
[[ -r "$template" ]] || die "template not readable: $template"

# Dump xml from stdin. The raw dump is private scratch space and is removed
# on exit; tmpdir is the script's output and is deliberately kept.
rawxmlfile="$(mktemp)" || die "mktemp failed"
tmpdir="$(mktemp -d)" || die "mktemp -d failed"
trap 'rm -f -- "$rawxmlfile"' EXIT
cat > "$rawxmlfile"

#######################################
# Escape plain text for use in LaTeX source.
# Backslashes are first tagged as "\textbackslash" (without braces) so the
# brace rule cannot touch braces we insert ourselves; the "{}" is appended
# once the text's own braces have been escaped.
# Inputs:  text on stdin
# Outputs: escaped text on stdout
#######################################
tex_escape() {
  sed \
    -e 's/\\/\\textbackslash/g' \
    -e 's/[{}]/\\&/g' \
    -e 's/\\textbackslash/\\textbackslash{}/g' \
    -e 's/~/\\textasciitilde{}/g' \
    -e 's/\^/\\textasciicircum{}/g' \
    -e 's/[#$%&_]/\\&/g'
}

#######################################
# Convert HTML on stdin to LaTeX with pandoc. Currently unused.
# NOTE(review): the tail/head offsets presumably strip pandoc's standalone
# preamble and closing lines; they depend on the pandoc version - confirm
# before wiring this in. "head -n -2" needs GNU head.
#######################################
tex_escape2() {
  pandoc --standalone -f html -t latex \
    | tail -n +50 | head -n -2 \
    | sed 's|\\rule{3in}|\\rule{1in}|g'
}

#######################################
# Print the English ordinal suffix (st/nd/rd/th) for a day of the month.
# Arguments: $1 - day number; a leading zero is accepted
#######################################
ordinal_suffix() {
  local day=$((10#$1))
  # 11th, 12th and 13th do not follow the last-digit rule.
  if ((day % 100 >= 11 && day % 100 <= 13)); then
    printf 'th'
    return
  fi
  case $((day % 10)) in
    1) printf 'st' ;;
    2) printf 'nd' ;;
    3) printf 'rd' ;;
    *) printf 'th' ;;
  esac
}

#######################################
# Turn arbitrary text into a file-name component: every byte outside
# [A-Za-z0-9 ._-] becomes "_". This removes path separators, the title
# delimiter, and characters TeX would interpret on its command line.
# Arguments: $1 - text
#######################################
safe_filename() {
  local cleaned
  cleaned="$(printf '%s' "$1" | LC_ALL=C tr -c 'A-Za-z0-9 ._-' '_')"
  # Stay well under common file-name length limits.
  printf '%s' "${cleaned:0:100}"
}

#######################################
# Escape text for a pdfmark "(...)" string: backslash and parentheses.
# Arguments: $1 - text
#######################################
pdfmark_escape() {
  printf '%s\n' "$1" | sed -e 's/\\/\\\\/g' -e 's/[()]/\\&/g'
}

#######################################
# Run an XPath query against the captured feed.
# Arguments: $1 - XPath expression
#######################################
xml_text() {
  xpath "$1" < "$rawxmlfile"
}

#######################################
# Fill the ~~placeholder~~ markers of the TeX template.
# Values travel through the environment and are inserted literally with
# index/substr, so "\", "&", "|" and newlines in them are not reinterpreted
# (a sed "s" command would mangle all of these).
# Arguments: $1 category, $2 title, $3 date, $4 author, $5 body, $6 org
# Outputs:   filled template on stdout
#######################################
render_template() {
  TPL_CATEGORY="$1" TPL_TITLE="$2" TPL_DATE="$3" \
    TPL_AUTHOR="$4" TPL_BODY="$5" TPL_ORG="$6" \
    awk '
      function fill(text, key, value,    out, at) {
        out = ""
        while ((at = index(text, key)) > 0) {
          out = out substr(text, 1, at - 1) value
          text = substr(text, at + length(key))
        }
        return out text
      }
      {
        line = $0
        line = fill(line, "~~category~~", ENVIRON["TPL_CATEGORY"])
        line = fill(line, "~~title~~", ENVIRON["TPL_TITLE"])
        line = fill(line, "~~date~~", ENVIRON["TPL_DATE"])
        line = fill(line, "~~author~~", ENVIRON["TPL_AUTHOR"])
        line = fill(line, "~~body~~", ENVIRON["TPL_BODY"])
        line = fill(line, "~~org~~", ENVIRON["TPL_ORG"])
        print line
      }
    ' "$template"
}

# Need to know date, category, title, author, and body.
# One date call, so the pieces cannot straddle midnight.
# Result looks like: Tuesday, October 2nd, 2018
IFS=' ' read -r weekday month day_padded year < <(date '+%A %B %d %Y')
day=$((10#$day_padded))
datestring="${weekday}, ${month} ${day}$(ordinal_suffix "$day"), ${year}"

# Count the items of the feed we were given on stdin.
numberofitems="$(xml_text 'count(/rss/channel/item)' | tr -d '[:space:]')"
[[ "$numberofitems" =~ ^[0-9]+$ ]] \
  || die "could not count feed items (xpath returned '${numberofitems}')"

# Parallel arrays describing every article that produced a PDF.
article_pdfs=()
article_categories=()
article_titles=()

for ((i = 1; i <= numberofitems; i++)); do
  node="/rss/channel/item[$i]"

  # Collect
  raw_category="$(xml_text "$node/category/text()")"
  raw_title="$(xml_text "$node/title/text()")"
  # RSS authors look like "address (Name)"; keep the name.
  raw_author="$(xml_text "$node/author/text()" | sed 's/.*(\(.*\))/\1/')"
  raw_org="NPR"
  raw_body="$(xml_text "$node/content:encoded/text()")"

  # Category and title are used as single-line labels and file names.
  raw_category="${raw_category//$'\n'/ }"
  raw_title="${raw_title//$'\n'/ }"

  # Escape
  category="$(printf '%s\n' "$raw_category" | tex_escape)"
  title="$(printf '%s\n' "$raw_title" | tex_escape)"
  author="$(printf '%s\n' "$raw_author" | tex_escape)"
  org="$(printf '%s\n' "$raw_org" | tex_escape)"
  body="$(printf '%s\n' "$raw_body" | tex_escape)"

  # Replace the last word of the body with an \hbox and \textbullet
  # ...

  # Mix with tex template. File names come from the raw text, not the
  # TeX-escaped text.
  jobname="$(safe_filename "$raw_category")${title_delim}$(safe_filename "$raw_title")"
  texout="$tmpdir/$jobname.tex"
  render_template "$category" "$title" "$datestring" "$author" "$body" "$org" \
    > "$texout"

  pdflatex \
    -halt-on-error \
    -output-directory "$tmpdir" \
    "$texout"

  pdfout="$tmpdir/$jobname.pdf"
  if [[ -f "$pdfout" ]]; then
    article_pdfs+=("$pdfout")
    article_categories+=("$raw_category")
    article_titles+=("$raw_title")
  else
    warn "pdflatex produced no PDF for item $i ($raw_title)"
  fi
done

((${#article_pdfs[@]} > 0)) || die "no article PDFs were produced"

# Create the bookmarks list--count the number of articles in each
# category, so we can build a nested list.
mapfile -t categories < <(printf '%s\n' "${article_categories[@]}" | sort -u)

unite_ordered_paths=()
page_count=1
for category in "${categories[@]}"; do
  section_start=$page_count
  article_index=()
  echo "### Articles:::"
  for idx in "${!article_pdfs[@]}"; do
    [[ "${article_categories[$idx]}" == "$category" ]] || continue
    article_path="${article_pdfs[$idx]}"
    title="${article_titles[$idx]}"
    num_pages="$(pdfinfo "$article_path" | awk '/^Pages:/ {print $NF}')"
    if [[ ! "$num_pages" =~ ^[0-9]+$ ]]; then
      warn "cannot read page count of $article_path; leaving it out"
      continue
    fi
    printf '%s\n' "$article_path"
    printf '%s\n' "### title $title"
    unite_ordered_paths+=("$article_path")
    article_index+=("[/Page $page_count /Title ($(pdfmark_escape "$title")) /OUT pdfmark")
    page_count=$((page_count + num_pages))
  done
  ((${#article_index[@]} > 0)) || continue
  echo "### This is the section marker"
  printf '%s\n' "[/Count ${#article_index[@]} /Page $section_start /Title ($(pdfmark_escape "$category")) /OUT pdfmark"
  # The section's article entries follow their parent marker.
  printf '%s\n' "${article_index[@]}"
done

((${#unite_ordered_paths[@]} > 0)) || die "no readable article PDFs to merge"

# Merge in the same order the bookmark page numbers were computed in.
pdfunite "${unite_ordered_paths[@]}" "$final_pdf"

#rm -rf "$tmpdir" "$rawxmlfile"
echo "$tmpdir"

# TODO: Take all the PDFs and organize them by category, then create the
# final PDF
# TODO: Echo the location of the final pdf