#!/usr/bin/env bash
#
# NPR to RSS
#
# Scrapes topic and story pages from the NPR text site ($site) and writes an
# RSS 2.0 document to stdout. Story bodies are emitted base64-encoded inside
# each item's <description>.
#
# NOTE(review): the copy of this script that was reviewed had every HTML/XML
# tag and HTML entity stripped out of its string literals. The tags and
# entities below (<p>, <li>, &amp;, the RSS element names, ...) were
# reconstructed from context -- confirm them against a pristine copy.

# NPR topic ids to pull stories from.
topics=(
  1003 # National
  1004 # World
  1006 # Business
  1007 # Science
  1013 # Education
  1017 # Economy
  1019 # Technology
  1008 # Arts & Life
)

# Scratch paths. Nothing visible in this script reads them, but they are kept
# for any code that does. mktemp creates "$tmpbase" itself, so both that file
# and the derived .tex path have to be removed on exit.
tmpbase="$(mktemp)" || { printf 'error: mktemp failed\n' >&2; exit 1; }
tmptex="${tmpbase}.tex"
# NOTE(review): predictable name under /tmp; unused in the visible script.
tmppdf="/tmp/NPR-$(date "+%Y-%m-%d").pdf"

cleanup() {
  rm -f -- "$tmpbase" "$tmptex"
}
trap cleanup EXIT

site="https://text.npr.org"

#######################################
# Escape LaTeX special characters.
# Reads text on stdin, writes the escaped text to stdout.
#######################################
tex_escape() {
  # Order matters:
  #  1. backslashes become a brace-less marker, so that
  #  2. only braces from the input get escaped, and then
  #  3. the marker is completed with its trailing {}.
  # Later rules only add text that no following rule matches.
  sed \
    -e 's|\\|\\textbackslash|g' \
    -e 's|[{}]|\\&|g' \
    -e 's|\\textbackslash|\\textbackslash{}|g' \
    -e 's|~|\\textasciitilde{}|g' \
    -e 's|\^|\\textasciicircum{}|g' \
    -e 's|[#$%&_]|\\&|g'
}

#######################################
# Replace everything up to and including the last "· " on each line with a
# paragraph tag. Reads stdin, writes stdout.
#######################################
bodyfix() {
  # NOTE(review): the separator may have been written as an entity
  # (e.g. &middot;) in the original pattern -- confirm.
  sed 's|.*· |<p>|g'
}

#######################################
# Convert HTML on stdin to a LaTeX fragment on stdout using pandoc.
#######################################
to_tex() {
  # Drops the first 49 and the last 2 lines of pandoc's standalone output
  # (presumably the preamble and the document end -- TODO confirm), then
  # shortens 3in rules to 1in.
  pandoc --standalone -f html -t latex \
    | tail -n +50 \
    | head -n -2 \
    | sed 's|\\rule{3in}|\\rule{1in}|g'
}

#######################################
# Encode the five XML-special characters as entities.
# Reads stdin, writes stdout.
#######################################
html_encode() {
  # '&' must be handled first so the entities added afterwards are not
  # themselves re-encoded.
  # NOTE(review): the apostrophe entity was lost in the reviewed copy;
  # &#39; is assumed.
  sed \
    -e 's/&/\&amp;/g' \
    -e 's/</\&lt;/g' \
    -e 's/>/\&gt;/g' \
    -e 's/"/\&quot;/g' \
    -e "s/'/\\&#39;/g"
}

#######################################
# Fetch a URL and write the response body to stdout.
# Arguments: $1 - URL
# Returns:   curl's exit status (non-zero on network or HTTP errors)
#######################################
fetch_page() {
  curl --silent --fail --connect-timeout 10 --max-time 60 "$1"
}

#######################################
# Validate a byline.
# Arguments: $1 - byline text
# Outputs:   the byline unchanged when its first space-separated word is
#            "By", otherwise the literal "Program Error"
#######################################
normalize_byline() {
  local byline=$1
  if [[ "${byline%% *}" == "By" ]]; then
    printf '%s\n' "$byline"
  else
    printf '%s\n' "Program Error"
  fi
}

#######################################
# Emit one RSS <item> per story listed on a topic page.
# Globals:   site (read)
# Arguments: $1 - NPR topic id
# Outputs:   RSS <item> elements on stdout; warnings on stderr
# Returns:   1 if the topic page could not be fetched, 0 otherwise
#######################################
get_topic_stories() {
  local tid=$1
  local tpage listing ttitle sid spage paragraphs stitle sbyline sbody
  local -a sids=()

  if ! tpage=$(fetch_page "$site/t.php?tid=$tid"); then
    printf 'warning: could not fetch topic %s\n' "$tid" >&2
    return 1
  fi

  # Both the topic heading and the story list are read from line 13 of the
  # topic page.
  listing=$(sed -n '13p' <<<"$tpage")
  ttitle=$(printf '%s\n' "$listing" \
    | awk -F'Topic: ' '{print $2}' \
    | awk -F'<' '{print $1}' \
    | html_encode)

  # Put each list entry on its own line, then take what follows "sId=" up to
  # the next '&' or '>' as the story id. -t drops the trailing newlines.
  readarray -t sids < <(printf '%s\n' "$listing" \
    | sed 's/<li>/\n<\/li>/g' \
    | awk -F'sId=' '{print $2}' \
    | awk -F'[&>]' '{print $1}' \
    | grep -v '^$')

  for sid in "${sids[@]}"; do
    sid=${sid//[[:space:]]/}

    if ! spage=$(fetch_page "$site/s.php?sId=$sid"); then
      printf 'warning: could not fetch story %s\n' "$sid" >&2
      continue
    fi

    # Among the lines containing a paragraph tag, the 3rd is used as the
    # title and the 4th as the byline.
    paragraphs=$(grep '<p>' <<<"$spage")
    stitle=$(sed -n '3p' <<<"$paragraphs" \
      | sed 's|<p>||g; s|</p>||g' \
      | html_encode)
    sbyline=$(sed -n '4p' <<<"$paragraphs" \
      | sed 's|<p>||g; s|</p>||g' \
      | html_encode)
    sbody=$(printf '%s\n' "$spage" \
      | bodyfix \
      | grep '<p>' \
      | tail -n +5 \
      | head -n -1)

    # The original tested an unset variable ($byline) here, so every item
    # was labelled "Program Error"; validate the byline actually scraped.
    sbyline=$(normalize_byline "$sbyline")

    # If there isn't much of a body, skip it.
    if (( $(printf '%s\n' "$sbody" | wc -l) < 5 )); then
      continue
    fi

    # Put it into RSS.
    # NOTE(review): element names reconstructed; the publication date is the
    # fixed literal carried over from the original script.
    printf '%s\n' \
      "<item>" \
      "<title>$stitle</title>" \
      "<description>$(printf '%s\n' "$sbody" | base64)</description>" \
      "<pubDate>Tue, 02 Oct 2018 19:20:00 -0400</pubDate>" \
      "<author>not-used@example.com ($sbyline)</author>" \
      "<category>$ttitle</category>" \
      "</item>"
  done

  return 0
}

#######################################
# Build the document: channel header, one batch of items per topic, footer.
# Globals:   topics (read)
# Outputs:   the complete RSS document on stdout
#######################################
main() {
  local tid

  # Header.
  # NOTE(review): XML declaration and channel element names reconstructed.
  cat <<'EOF'
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>NPR</title>
<link>http://this-is-a-link-to-the-channel-page</link>
<description>Description of the channel</description>
<language>en</language>
<copyright>Copyright 2018 NPR</copyright>
<generator>RSS to LaTeX</generator>
<managingEditor>d@visr.me (Davis Remmel)</managingEditor>
<pubDate>Tue, 02 Oct 2018 19:20:00 -0400</pubDate>
EOF

  # Items in the middle. Best effort: a topic that cannot be fetched is
  # skipped rather than aborting the whole feed.
  for tid in "${topics[@]}"; do
    get_topic_stories "$tid" || continue
  done

  # Footer.
  cat <<'EOF'
</channel>
</rss>
EOF
}

main "$@"