#!/usr/bin/env bash
#
# NPR to PDF
#
# Pulls stories from text.npr.org and assembles them into a PDF suitable for
# reading offline. The path of the generated PDF is printed on stdout;
# warnings and errors go to stderr.
#
# Requires: bash 4+ (readarray), curl, GNU sed/head, wkhtmltopdf.
#
# NOTE(review): this script reached review with its line breaks collapsed and
# the HTML tags inside its string literals stripped. The tags used below
# (<li>, <p>, <h1>, <h2>, <h3>, <div>, and the document header/footer) were
# reconstructed from context -- confirm them against the original script and
# against the markup text.npr.org actually serves.

set -u

# Topic IDs to include, in the order they appear in the document.
readonly topics=(
  1003 # National
  1004 # World
  1006 # Business
  1007 # Science
  1013 # Education
  1017 # Economy
  1019 # Technology
  1008 # Arts & Life
)

readonly site="https://text.npr.org"

# Output location; one PDF per calendar day, overwritten on re-runs.
tmppdf="/tmp/NPR-$(date "+%Y-%m-%d").pdf"
readonly tmppdf

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Work inside a private temp directory: the intermediate file needs an .html
# suffix, and appending a suffix to a plain `mktemp` result would leave the
# file mktemp created behind while using a second, non-atomically-created name.
tmpdir="$(mktemp -d)" || die "error: mktemp failed"
readonly tmpdir
readonly tmphtml="$tmpdir/npr.html"

# Remove the temp directory on every exit path.
cleanup() {
  rm -rf -- "${tmpdir:?}"
}
trap cleanup EXIT

#######################################
# Fetch a URL and write its body to stdout.
# Arguments: $1 - URL
# Returns:   curl's exit status (non-zero on transport or HTTP errors)
#######################################
fetch() {
  curl --silent --fail "$1"
}

#######################################
# Emit the HTML for one topic: a heading followed by every story linked
# from the topic page.
# Globals:   site (read)
# Arguments: $1 - numeric topic ID
# Outputs:   HTML fragment on stdout, warnings on stderr
# Returns:   0 always; topics or stories that cannot be fetched are skipped
#######################################
get_topic_stories() {
  local tid=$1
  local turl="$site/t.php?tid=$tid"
  local tpage tline ttitle sid spage sparas stitle sbyline sbody
  local -a sids=()
  local i

  if ! tpage="$(fetch "$turl")"; then
    printf 'warning: could not fetch topic %s\n' "$tid" >&2
    return 0
  fi

  # The script reads the topic title and the whole story list from line 13
  # of the topic page.
  tline="$(sed -n '13p' <<<"$tpage")"

  ttitle="$(
    printf '%s\n' "$tline" \
      | awk -F'Topic: ' '{print $2}' \
      | awk -F'<' '{print $1}'
  )"

  # Break the line before every list item, then pull the value of each
  # sId= query parameter (terminated by '&' or '>').
  readarray -t sids < <(
    printf '%s\n' "$tline" \
      | sed 's/<li>/\n<\/li>/g' \
      | awk -F'sId=' '{print $2}' \
      | awk -F'[&>]' '{print $1}' \
      | grep -v '^$'
  )

  # NOTE(review): wrapper and heading tags are reconstructed; the `toc`
  # argument passed to wkhtmltopdf suggests headings were used here.
  printf '<div>\n'
  printf '<h1>%s</h1>\n' "$ttitle"

  for (( i = 0; i < ${#sids[@]}; i++ )); do
    sid=${sids[i]}

    if ! spage="$(fetch "$site/s.php?sId=$sid")"; then
      printf 'warning: could not fetch story %s\n' "$sid" >&2
      continue
    fi

    # Only the paragraph lines matter: the 3rd is taken as the title, the
    # 4th as the byline, and the 5th through the second-to-last as the body.
    sparas="$(grep '<p>' <<<"$spage")"
    stitle="$(sed -n '3p' <<<"$sparas" | sed -e 's|<p>||g' -e 's|</p>||g')"
    sbyline="$(sed -n '4p' <<<"$sparas" | sed -e 's|<p>||g' -e 's|</p>||g')"
    sbody="$(tail -n +5 <<<"$sparas" | head -n -1)"

    printf '<div>\n'
    printf '<h2>%s</h2>\n' "$stitle"
    printf '<h3>%s</h3>\n' "$sbyline"
    printf '%s\n' "$sbody"
    printf '</div>\n'
  done

  printf '</div>\n'
}

#######################################
# Build the HTML document, render it to PDF, and print the PDF path.
# Globals:   topics, tmphtml, tmppdf (read)
# Outputs:   PDF path on stdout
# Returns:   exits 1 if a required tool is missing or no PDF was produced
#######################################
main() {
  local tid
  local rv=0

  command -v curl >/dev/null || die "error: curl is required"
  command -v wkhtmltopdf >/dev/null || die "error: wkhtmltopdf is required"

  # Build the document.
  # NOTE(review): header/footer markup reconstructed around the surviving
  # title text "NPR".
  {
    cat <<'EOF'
<html>
<head>
<meta charset="utf-8">
<title>NPR</title>
</head>
<body>
EOF
    for tid in "${topics[@]}"; do
      get_topic_stories "$tid"
    done
    cat <<'EOF'
</body>
</html>
EOF
  } >"$tmphtml"

  # stderr is discarded as in the original (presumably to hide wkhtmltopdf's
  # progress output). A non-zero status is tolerated as long as a PDF was
  # written.
  wkhtmltopdf \
    --page-width "6.12in" \
    --page-height "8.23in" \
    --print-media-type \
    --margin-top 1in \
    --margin-bottom 1in \
    --margin-left 1in \
    --margin-right 1in \
    toc "$tmphtml" "$tmppdf" 2>/dev/null || rv=$?

  if (( rv != 0 )) && [[ ! -s "$tmppdf" ]]; then
    die "error: wkhtmltopdf failed (exit $rv); no PDF written"
  fi

  printf '%s\n' "$tmppdf"
}

main "$@"