#!/usr/bin/env bash

## NPR to PDF
## This script pulls stories from text.npr.org, and assembles them into
## a PDF suitable for reading offline.
##
## Usage: run with no arguments. The path of the generated PDF is printed
## on stdout; warnings and errors go to stderr.
## Requires: curl, wkhtmltopdf, and GNU head (for `head -n -1`).
##
## NOTE(review): the copy of this script that was reviewed had its embedded
## HTML tags stripped (inside echo strings, grep/sed patterns and the
## heredocs). Every tag literal below is a reconstruction -- confirm each
## one against the live page markup before relying on the output.

# Topic ids to include, in the order they appear in the document.
topics=(
  1003 # National
  1004 # World
  1006 # Business
  1007 # Science
  1013 # Education
  1017 # Economy
  1019 # Technology
  1008 # Arts & Life
)

site="https://text.npr.org"
tmppdf="/tmp/NPR-$(date "+%Y-%m-%d").pdf"

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print a non-fatal warning to stderr.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

for tool in curl wkhtmltopdf; do
  command -v "$tool" >/dev/null || die "required tool not found: $tool"
done

# The intermediate HTML lives in a private temp directory so that it can
# carry an .html suffix without leaving a stray mktemp file behind.
tmpdir="$(mktemp -d)" || die "could not create a temporary directory"
tmphtml="$tmpdir/npr.html"

# Remove the temporary working directory; installed as the EXIT trap.
cleanup() {
  rm -rf -- "${tmpdir:?}"
}
trap cleanup EXIT

#######################################
# Emit one HTML section for a topic: a heading followed by every story
# linked from the topic's listing page.
# Globals:   site (read)
# Arguments: $1 - numeric topic id (the `tid` query parameter)
# Outputs:   HTML fragment on stdout; warnings on stderr
# Returns:   0 on success, 1 if the topic page could not be fetched or
#            contained no story links (nothing is written in that case)
#######################################
function get_topic_stories {
  local tid=$1
  local turl="$site/t.php?tid=$tid"
  local tpage listing ttitle sid spage paragraphs stitle sbyline sbody
  local -a sids=()

  if ! tpage="$(curl --silent --fail "$turl")"; then
    warn "could not fetch topic $tid ($turl)"
    return 1
  fi

  # The script relies on the topic heading and the whole story list being
  # on line 13 of the topic page.
  listing="$(sed -n '13p' <<<"$tpage")"
  ttitle="$(awk -F'Topic: ' '{print $2}' <<<"$listing" | awk -F'<' '{print $1}')"

  # Collect the story ids from the sId= query parameters. A value ends at
  # '&' or '>' as before; whitespace also ends it so the id is URL-safe.
  # Duplicates are dropped while keeping the listing order.
  mapfile -t sids < <(
    grep -o 'sId=[^&>[:space:]]*' <<<"$listing" \
      | sed 's/^sId=//' \
      | grep -v '^$' \
      | awk '!seen[$0]++'
  )

  if (( ${#sids[@]} == 0 )); then
    warn "no stories found for topic $tid"
    return 1
  fi

  # NOTE(review): wrapper and heading tags are reconstructed -- confirm.
  printf '%s\n' '<div>'
  printf '<h1>%s</h1>\n' "$ttitle"

  for sid in "${sids[@]}"; do
    if ! spage="$(curl --silent --fail "$site/s.php?sId=$sid")"; then
      warn "could not fetch story $sid"
      continue
    fi

    # NOTE(review): '<p>' is a reconstructed pattern -- confirm.
    # Among the matching lines, the 3rd is used as the title, the 4th as
    # the byline, and the 5th up to (but excluding) the last as the body.
    paragraphs="$(grep '<p>' <<<"$spage")"
    stitle="$(sed -n '3p' <<<"$paragraphs" | sed -e 's|<p>||g' -e 's|</p>||g')"
    sbyline="$(sed -n '4p' <<<"$paragraphs" | sed -e 's|<p>||g' -e 's|</p>||g')"
    sbody="$(tail -n +5 <<<"$paragraphs" | head -n -1)"

    printf '%s\n' '<div>'
    printf '<h2>%s</h2>\n' "$stitle"
    printf '<p>%s</p>\n' "$sbyline"
    printf '%s\n' "$sbody"
    printf '%s\n' '</div>'
  done

  printf '%s\n' '</div>'
}

# Build the document
# NOTE(review): the page skeleton is reconstructed; only the text "NPR"
# survived in the reviewed copy -- confirm.
{
  cat <<'EOF'
<html>
<head>
<title>NPR</title>
</head>
<body>
EOF

  for tid in "${topics[@]}"; do
    get_topic_stories "$tid"
  done

  cat <<'EOF'
</body>
</html>
EOF
} > "$tmphtml"

# Diagnostics from wkhtmltopdf are deliberately suppressed, as before; the
# result is validated by checking the output file instead.
wkhtmltopdf \
  --page-width "6.12in" \
  --page-height "8.23in" \
  --print-media-type \
  --margin-top 1in \
  --margin-bottom 1in \
  --margin-left 1in \
  --margin-right 1in \
  toc "$tmphtml" "$tmppdf" 2>/dev/null

[[ -s "$tmppdf" ]] || die "wkhtmltopdf did not produce $tmppdf"

echo "$tmppdf"