#!/usr/bin/env bash ## NPR to RSS topics=( 1003 # National 1004 # World 1006 # Business 1007 # Science 1013 # Education 1017 # Economy 1019 # Technology 1008 # Arts & Life ) tmptex="$(mktemp).tex" trap "rm -f $tmptex" EXIT tmppdf="/tmp/NPR-$(date "+%Y-%m-%d").pdf" site="https://text.npr.org" function tex_escape { sed 's|{|\{|g' \ | sed 's|}|\}|g' \ | sed 's|~|\\textasciitilde{}|g' \ | sed 's|\\|\\textbackslash{}|g' \ | sed 's|\^|\\textasciicircum{}|g' \ | sed 's|\#|\\\#|g' \ | sed 's|\$|\\\$|g' \ | sed 's|%|\%|g' \ | sed 's|&|\\&|g' \ | sed 's|_|\_|g' } function bodyfix { sed 's|.*· |
|g' } function to_tex { pandoc --standalone -f html -t latex | tail -n +50 | head -n -2 \ | sed 's|\\rule{3in}|\\rule{1in}|g' } function html_encode { sed 's/&/\&/g; s/\</g; s/>/\>/g; s/"/\"/g; s/'"'"'/\'/g' } function get_topic_stories { tid=$1 turl="$site/t.php?tid=$tid" tpage="$(curl --silent "$turl")" ttitle="$(echo "$tpage" | sed '13p;d' | awk -F'Topic: ' '{print $2}' | awk -F'<' '{print $1}' | html_encode)" readarray sids < <(echo "$tpage" | sed '13p;d' | sed 's/
' | sed '3p;d' | sed 's|
||g' | sed 's|
||g' | html_encode)" sbyline="$(echo "$spage" | grep '' | sed '4p;d' | sed 's|
||g' | sed 's|
||g' | html_encode)" sbody="$(echo "$spage" | bodyfix | grep '' | tail -n +5 | head -n -1)"
if [[ "$(echo "$byline" | cut -d ' ' -f 1 )" != "By" ]]
then
sbyline="Program Error"
fi
# If there isn't much of a body, skip it
if [[ 5 -gt $(echo "$sbody" | wc -l) ]]
then
continue
fi
# Put it into RSS
echo "