第22章 扩展示例:Web客户端编程

播客采集软件通常是让用户通过往他们的配置里添加RSS URL来订阅播客。然后,用户可以定期执行更新操作。播客采集软件将下载RSS文件,检查它们的音频文件引用,并下载该用户之前没有下载过的音频文件。



本章中的代码是专门为这本书所写,但它基于hpodder的代码;hpodder是一个用Haskell编写的播客采集软件。hpodder的特性比这里展示的例子多得多,因此它太长、太复杂,无法在本书中完整展现。如果您有兴趣学习hpodder,它的源代码可以免费获取:http://software.complete.org/hpodder。




-- file: ch22/PodTypes.hs
module PodTypes where
-- | A podcast feed known to the program.
data Podcast =
    Podcast {castId :: Integer, -- ^ Numeric ID for this podcast
             castURL :: String -- ^ Its feed URL
            }
    deriving (Eq, Show, Read)

-- | A single downloadable episode belonging to a podcast.
data Episode = 
    Episode {epId :: Integer, -- ^ Numeric ID for this episode
             epCast :: Podcast, -- ^ The podcast it came from
             epURL :: String, -- ^ The download URL for this episode
             epDone :: Bool -- ^ Whether or not we are done with this ep
            }
    deriving (Eq, Show, Read)




-- file: ch22/PodDB.hs
module PodDB where

import Database.HDBC
import Database.HDBC.Sqlite3
import PodTypes
import Control.Monad(when)
import Data.List(sort)

-- | Open the SQLite database at the given path, run 'prepDB' on it so the
-- tables this program uses exist, and return the open connection.
connect :: FilePath -> IO Connection
connect fp =
    connectSqlite3 fp >>= \conn -> prepDB conn >> return conn
{- | Prepare the database for our data.

We create two tables and ask the database engine to verify some pieces
of data consistency for us:

* castid and epid both are unique primary keys and must never be duplicated
* castURL also is unique
* In the episodes table, for a given podcast (epcast), there must be only
  one instance of each given URL or episode ID

Each table is created only if it is not already listed by 'getTables',
so calling this on an existing database leaves its tables untouched.
The work is committed before returning. -}
prepDB :: IConnection conn => conn -> IO ()
prepDB dbh =
    do tables <- getTables dbh
       -- The SQL is split over several source lines with string gaps
       -- (backslash ... backslash), which add nothing to the literal.
       when ("podcasts" `notElem` tables) $
           do run dbh "CREATE TABLE podcasts (\
                       \castid INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,\
                       \castURL TEXT NOT NULL UNIQUE)" []
              return ()
       when ("episodes" `notElem` tables) $
           do run dbh "CREATE TABLE episodes (\
                       \epid INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,\
                       \epcastid INTEGER NOT NULL,\
                       \epurl TEXT NOT NULL,\
                       \epdone INTEGER NOT NULL,\
                       \UNIQUE(epcastid, epurl),\
                       \UNIQUE(epcastid, epid))" []
              return ()
       commit dbh

{- | Adds a new podcast to the database. Ignores the castid on the
incoming podcast, and returns a new object with the castid populated.
An attempt to add a podcast that already exists is an error.

Any SQL error raised while inserting or looking up the row is turned
into a 'fail' whose message includes the original error. -}
addPodcast :: IConnection conn => conn -> Podcast -> IO Podcast
addPodcast dbh podcast = 
    handleSql errorHandler $
      do -- Insert the castURL into the table. The database
         -- will automatically assign a cast ID.
         run dbh "INSERT INTO podcasts (castURL) VALUES (?)"
             [toSql (castURL podcast)]
         -- Find out the castID for the URL we just added.
         r <- quickQuery' dbh "SELECT castid FROM podcasts WHERE castURL = ?"
              [toSql (castURL podcast)]
         case r of
           [[x]] -> return $ podcast {castId = fromSql x}
           y -> fail $ "addPodcast: unexpected result: " ++ show y
    where errorHandler e = 
              fail $ "Error adding podcast; does this URL already exist?\n"
                     ++ show e

{- | Adds a new episode to the database. 

Since this is done by automation instead of by user request, we will
simply ignore requests to add duplicate episodes. This way, when we are
processing a feed, each URL encountered can be fed to this function,
without having to first look it up in the DB.

Also, we generally won't care about the new ID here, so don't bother
fetching it. The epId of the incoming episode is not written. -}
addEpisode :: IConnection conn => conn -> Episode -> IO ()
addEpisode dbh ep =
    -- The statement is one literal joined by a string gap.
    run dbh "INSERT OR IGNORE INTO episodes (epCastId, epURL, epDone) \
                \VALUES (?, ?, ?)"
                [toSql (castId . epCast $ ep), toSql (epURL ep),
                toSql (epDone ep)]
    >> return ()

{- | Rewrites the stored feed URL of an existing podcast. The row is
selected by the castId of the given Podcast and its castURL column is
set to the Podcast's castURL. -}
updatePodcast :: IConnection conn => conn -> Podcast -> IO ()
updatePodcast dbh podcast =
    do _ <- run dbh "UPDATE podcasts SET castURL = ? WHERE castId = ?" params
       return ()
    where params = [toSql (castURL podcast), toSql (castId podcast)]

{- | Modifies an existing episode. Looks it up by ID and modifies the
database record to match the given episode: its podcast ID, URL and
done flag are all overwritten. -}
updateEpisode :: IConnection conn => conn -> Episode -> IO ()
updateEpisode dbh episode =
    -- The statement is one literal joined by a string gap.
    run dbh "UPDATE episodes SET epCastId = ?, epURL = ?, epDone = ? \
            \WHERE epId = ?"
            [toSql (castId . epCast $ episode),
             toSql (epURL episode),
             toSql (epDone episode),
             toSql (epId episode)]
    >> return ()

{- | Deletes a podcast from the database. The episodes that point at
the podcast's castId are deleted first, then the podcast row itself. -}
removePodcast :: IConnection conn => conn -> Podcast -> IO ()
removePodcast dbh podcast =
    do _ <- run dbh "DELETE FROM episodes WHERE epcastid = ?" key
       _ <- run dbh "DELETE FROM podcasts WHERE castid = ?" key
       return ()
    where key = [toSql (castId podcast)]

{- | Returns every podcast stored in the database, ordered by castid. -}
getPodcasts :: IConnection conn => conn -> IO [Podcast]
getPodcasts dbh =
    fmap (map convPodcastRow)
         (quickQuery' dbh 
              "SELECT castid, casturl FROM podcasts ORDER BY castid" [])

{- | Looks up one podcast by its ID. Yields Nothing when no row has
that ID and Just the podcast when exactly one does; more than one
matching row is reported with 'fail'. -}
getPodcast :: IConnection conn => conn -> Integer -> IO (Maybe Podcast)
getPodcast dbh wantedId =
    quickQuery' dbh 
        "SELECT castid, casturl FROM podcasts WHERE castid = ?"
        [toSql wantedId]
    >>= pick
    where pick [] = return Nothing
          pick [row] = return (Just (convPodcastRow row))
          pick _ = fail $ "Really bad error; more than one podcast with ID"

{- | Builds a Podcast from one result row of the form [castid, casturl].
A row of any other shape raises an error that shows the row. -}
convPodcastRow :: [SqlValue] -> Podcast
convPodcastRow row =
    case row of
      [svId, svURL] -> Podcast {castId = fromSql svId,
                                castURL = fromSql svURL}
      _ -> error $ "Can't convert podcast row " ++ show row

{- | Get all episodes for a particular podcast. Every returned Episode
has its epCast set to the Podcast that was passed in. -}
getPodcastEpisodes :: IConnection conn => conn -> Podcast -> IO [Episode]
getPodcastEpisodes dbh pc =
    do r <- quickQuery' dbh
            "SELECT epId, epURL, epDone FROM episodes WHERE epCastId = ?"
            [toSql (castId pc)]
       return (map convEpisodeRow r)
    where convEpisodeRow [svId, svURL, svDone] =
              Episode {epId = fromSql svId, epURL = fromSql svURL,
                       epDone = fromSql svDone, epCast = pc}
          -- Mirror convPodcastRow: report an unexpected row shape with a
          -- readable message instead of a bare pattern-match failure.
          convEpisodeRow x =
              error $ "Can't convert episode row " ++ show x


ghci> :load PodDB.hs
[1 of 2] Compiling PodTypes ( PodTypes.hs, interpreted )
[2 of 2] Compiling PodDB ( PodDB.hs, interpreted )
Ok, modules loaded: PodDB, PodTypes.
ghci> dbh <- connect "poddbtest.db"
ghci> :type dbh
dbh :: Connection
ghci> getTables dbh
ghci> let url = "http://feeds.thisamericanlife.org/talpodcast"
ghci> pc <- addPodcast dbh (Podcast {castId=0, castURL=url})
Podcast {castId = 1, castURL = "http://feeds.thisamericanlife.org/talpodcast"}
ghci> getPodcasts dbh
[Podcast {castId = 1, castURL = "http://feeds.thisamericanlife.org/talpodcast"}]
ghci> addEpisode dbh (Episode {epId = 0, epCast = pc, epURL = 
"http://www.example.com/foo.mp3", epDone = False})
ghci> getPodcastEpisodes dbh pc
[Episode {epId = 1, epCast = Podcast {castId = 1, castURL = 
"http://feeds.thisamericanlife.org/talpodcast"}, epURL = "http://www.example.com/foo.mp3", 
epDone = False}]
ghci> commit dbh
ghci> disconnect dbh



<?xml version="1.0" encoding="utf-8"?>
<rss xmlns:itunes="http://www.itunes.com/DTDs/Podcast-1.0.dtd" version="2.0">
  <channel>
    <title>Haskell Radio</title>
    <description>Description of this podcast</description>
    <item>
      <title>Episode 2: Lambdas</title>
      <enclosure url="http://www.example.com/radio/lambdas.mp3" type="audio/mpeg" length="10485760"/>
    </item>
    <item>
      <title>Episode 1: Parsec</title>
      <enclosure url="http://www.example.com/radio/parsec.mp3" type="audio/mpeg" length="10485150"/>
    </item>
  </channel>
</rss>


-- file: ch22/PodParser.hs
module PodParser where

import PodTypes
import Text.XML.HaXml
import Text.XML.HaXml.Parse
import Text.XML.HaXml.Html.Generate(showattr)
import Data.Char
import Data.List

-- | One downloadable item found in a feed.
data PodItem = PodItem {itemtitle :: String, -- ^ Title of the item
                  enclosureurl :: String -- ^ URL taken from its enclosure
                  }
          deriving (Eq, Show, Read)

-- | The parts of a parsed feed that this program uses.
data Feed = Feed
    { channeltitle :: String -- ^ Title of the channel
    , items :: [PodItem] -- ^ Items found in the channel
    }
    deriving (Eq, Show, Read)

{- | Turns a PodItem from the given podcast into an Episode. The new
episode has ID 0, is not marked done, and takes its URL from the
item's enclosure URL. -}
item2ep :: Podcast -> PodItem -> Episode
item2ep pc item = Episode
    { epCast = pc
    , epURL = enclosureurl item
    , epDone = False
    , epId = 0
    }

{- | Parse the data from a given string, with the given name to use
in error messages.

The first argument is the document text and the second is the name
passed on to 'xmlParse'. The result holds the channel title and the
enclosures found in the document. -}
parse :: String -> String -> Feed
parse content name = 
    Feed {channeltitle = getTitle doc,
          items = getEnclosures doc}

    where parseResult = xmlParse name (stripUnicodeBOM content)
          doc = getContent parseResult

          -- Wrap the document's root element so CFilters can be applied.
          getContent :: Document -> Content
          getContent (Document _ _ e _) = CElem e

          {- | Some Unicode documents begin with a binary sequence;
             strip it off before processing. -}
          stripUnicodeBOM :: String -> String
          -- EF BB BF is the UTF-8 byte order mark, seen here as three
          -- separate characters.
          stripUnicodeBOM ('\xef':'\xbb':'\xbf':x) = x
          stripUnicodeBOM x = x

{- | Pull out the channel part of the document.

Note that HaXml defines CFilter as:

> type CFilter = Content -> [Content]
-}
channel :: CFilter
channel = tag "rss" /> tag "channel"

{- | Extracts the text of the channel's title element, falling back to
"Untitled Podcast" when the filter matches nothing. -}
getTitle :: Content -> String
getTitle doc = contentToStringDefault "Untitled Podcast" titleText
    where titleText = (channel /> tag "title" /> txt) doc

{- | Collects one PodItem for every "url" attribute of every enclosure
element of every item element in the channel, in document order. Each
PodItem carries the title of the item it came from, or
"Untitled Episode" when the item has no title text. -}
getEnclosures :: Content -> [PodItem]
getEnclosures doc =
    [ PodItem {itemtitle = titleOf item,
               enclosureurl = contentToString [urlAttr]}
    | item <- (channel /> tag "item") doc
    , encl <- (keep /> tag "enclosure") item
    , urlAttr <- showattr "url" encl
    ]
    where titleOf :: Content -> String
          titleOf item = contentToStringDefault "Untitled Episode"
                             ((keep /> tag "title" /> txt) item)

{- | Renders [Content] as a printable String via 'contentToString'. An
empty list, which signifies that nothing matched, yields the supplied
default instead. -}
contentToStringDefault :: String -> [Content] -> String
contentToStringDefault msg contents
    | null contents = msg
    | otherwise = contentToString contents

{- | Renders [Content] as a printable string, unescaping it on the way.

Without unescaping this could simply be:

> contentToString = concatMap (show . content)

HaXml's unescaping works only on Elements, so each piece of Content is
first wrapped in a throwaway "fake" Element, unescaped, and then txt is
used to pull the insides back out. -}
contentToString :: [Content] -> String
contentToString = 
    concatMap (verbatim . unwrap . wrap)
    where -- Put the piece inside a fake element and unescape that element.
          wrap piece =
              CElem (xmlUnEscape stdXmlEscaper (Elem "fake" [] [piece]))

          -- Pull the text children back out of the wrapper.
          unwrap = keep /> txt

接下来,是时候解析了。 parse函数需要一个代表XML内容的字符串和一个代表错误消息名的字符串,然后返回一个Feed。

type CFilter = Content -> [Content]

也就是说,一个CFilter接受XML文档的一个片段作为参数,并返回0个或多个片段。CFilter可以用来查找指定标签的所有子节点、所有具有某个名称的标签、XML文档某一部分所包含的文本,或者其他许多东西。还有一个运算符(/>)可以把CFilter函数串联起来。我们感兴趣的所有数据都位于<channel>标签之内,所以首先要取得这一部分。为此,我们定义了一个简单的CFilter:

channel = tag "rss" /> tag "channel"



我们会从HTTP服务器下载,所以我们将使用一个Haskell HTTP库(http://www.haskell.org/http/)。为了下载播客feeds,我们将下载文档,解析它,并更新数据库。对于每一集(episode)的音频,我们将下载该文件,写入到磁盘上,并在数据库中标出它已下载。下面是代码:

-- file: ch22/PodDownload.hs
module PodDownload where
import PodTypes
import PodDB
import PodParser
import Network.HTTP
import System.IO
import Database.HDBC
import Data.Maybe
import Network.URI

{- | Download a URL. (Left errorMessage) if an error,
(Right doc) if success.

A string that cannot be parsed as a URI gives a Left rather than a
crash. A 3xx response is followed through its Location header; a 3xx
without that header, or any other non-2xx response, gives a Left
holding the shown response. -}
downloadURL :: String -> IO (Either String String)
downloadURL url =
    case parseURI url of
      Nothing -> return $ Left ("Invalid URL: " ++ url)
      Just uri ->
          do resp <- simpleHTTP (request uri)
             case resp of
               Left x -> return $ Left ("Error connecting: " ++ show x)
               Right r -> 
                   case rspCode r of
                     (2,_,_) -> return $ Right (rspBody r)
                     (3,_,_) -> -- A HTTP redirect
                       case findHeader HdrLocation r of
                         Nothing -> return $ Left (show r)
                         -- NOTE: there is no limit on how many
                         -- redirects are followed.
                         Just newURL -> downloadURL newURL
                     _ -> return $ Left (show r)
    where request uri = Request {rqURI = uri,
                                 rqMethod = GET,
                                 rqHeaders = [],
                                 rqBody = ""}

{- | Downloads the podcast's feed, parses it, and adds every item in it
to the database as an episode, then commits. If the download fails,
the error message is printed and the database is left alone. -}
updatePodcastFromFeed :: IConnection conn => conn -> Podcast -> IO ()
updatePodcastFromFeed dbh pc =
    downloadURL (castURL pc) >>= either putStrLn storeEpisodes

    where storeEpisodes doc =
              do mapM_ (addEpisode dbh . item2ep pc)
                       (items (parse doc (castURL pc)))
                 commit dbh

{- | Downloads an episode, returning a String representing
the filename it was placed into, or Nothing on error.

On success the body is written to the file in binary mode, the episode
is marked done in the database, and the change is committed. On a
download error the message is printed and nothing is written. -}
getEpisode :: IConnection conn => conn -> Episode -> IO (Maybe String)
getEpisode dbh ep =
    do resp <- downloadURL (epURL ep)
       case resp of
         Left x -> do putStrLn x
                      return Nothing
         Right doc -> 
             do -- withBinaryFile closes the handle even if the write
                -- throws, so a failed write cannot leak it.
                withBinaryFile filename WriteMode (\file -> hPutStr file doc)
                updateEpisode dbh (ep {epDone = True})
                commit dbh
                return (Just filename)
         -- This function ought to apply an extension based on the file type
    where filename = "pod." ++ (show . castId . epCast $ ep) ++ "." ++ 
                     (show (epId ep)) ++ ".mp3"

这个模块定义了三个函数:downloadURL,简单地下载URL然后将它以String返回;updatePodcastFromFeed,下载一个XML feed文件,解析它,并更新数据库;getEpisode,下载给定的一集(episode),并在数据库中把它标记为已下载。




-- file: ch22/PodMain.hs
module Main where
import PodDownload
import PodDB
import PodTypes
import System.Environment
import Database.HDBC
import Network.Socket(withSocketsDo)

-- | Program entry point: opens "pod.db", runs the command named by the
-- command-line arguments (printing the usage text for anything
-- unrecognised), and then disconnects.
main = withSocketsDo . handleSqlError $
    do args <- getArgs
       dbh <- connect "pod.db"
       dispatch dbh args
       disconnect dbh
    where dispatch dbh ["add", url] = add dbh url
          dispatch dbh ["update"] = update dbh
          dispatch dbh ["download"] = download dbh
          dispatch dbh ["fetch"] = update dbh >> download dbh
          dispatch _ _ = syntaxError

-- | Stores a new podcast with the given feed URL and commits. The
-- castId of 0 passed in is only a placeholder.
add dbh url = 
    addPodcast dbh newCast >> commit dbh
    where newCast = Podcast {castId = 0, castURL = url}

-- | Refreshes every stored podcast from its feed, printing a progress
-- line for each one first.
update dbh = 
    getPodcasts dbh >>= mapM_ refresh
    where refresh pc =
              putStrLn ("Updating from " ++ (castURL pc))
              >> updatePodcastFromFeed dbh pc

-- | Goes through every stored podcast and downloads each of its
-- episodes that is not yet marked done, printing progress as it goes.
download dbh =
    do pclist <- getPodcasts dbh
       mapM_ procPodcast pclist
    where procPodcast pc =
              do putStrLn $ "Considering " ++ (castURL pc)
                 episodelist <- getPodcastEpisodes dbh pc
                 -- Keep only the episodes still waiting to be downloaded.
                 let dleps = filter (not . epDone) episodelist
                 mapM_ procEpisode dleps
          procEpisode ep =
              do putStrLn $ "Downloading " ++ (epURL ep)
                 getEpisode dbh ep

-- | Prints the command-line usage summary. The text is a single
-- literal: each source line ends in an explicit newline escape and is
-- joined to the next by a string gap.
syntaxError = putStrLn 
  "Usage: pod command [args]\n\
  \\n\
  \pod add url      Adds a new podcast with the given URL\n\
  \pod download     Downloads all pending episodes\n\
  \pod fetch        Updates, then downloads\n\
  \pod update       Downloads podcast feeds, looks for new episodes\n"


ghc --make -O2 -o pod -package HTTP -package HaXml -package network \
    -package HDBC -package HDBC-sqlite3 PodMain.hs

另外,你也可以使用一个Cabal文件来构建,相关文档见第131页的"Creating a Package"一节:

-- ch23/pod.cabal
Name: pod
Version: 1.0.0
Build-type: Simple
Build-Depends: HTTP, HaXml, network, HDBC, HDBC-sqlite3, base
Executable: pod
Main-Is: PodMain.hs
GHC-Options: -O2


import Distribution.Simple
-- Setup script entry point: hands control to defaultMain from
-- Distribution.Simple.
main = defaultMain 


runghc Setup.hs configure
runghc Setup.hs build

然后你会发现一个dist的目录中包含你的输出。在系统中安装程序,运行runghc Setup.hs install。
