How to create an RSS feed from an HTML page in Python?

by Martin Monperrus

Here is an example program that creates an RSS feed from an HTML page in Python.

You can comment on the program below :-)

–Martin

  #!/usr/bin/python
  # coding: utf-8
  """ 
  Outputs an RSS feed from an HTML page

  Martin Monperrus
  June 2015
  """

  from lxml import etree
  import feedgenerator
  import requests
  import os

  # Page whose nested table rows are turned into feed items.
  SOURCE_URL = 'https://www.galaxie.enseignementsup-recherche.gouv.fr/ensup/ListesPostesPublies/Emplois_publies_TrieParCorps.html'

  # Seconds to wait for the remote server; without a timeout requests.get()
  # can block forever and leave the CGI request hanging.
  REQUEST_TIMEOUT = 30


  def first_or_empty(values):
      """Return the first XPath result, or the placeholder 'empty' if none.

      values -- the list returned by an XPath query (possibly empty).
      """
      return values[0] if values else 'empty'


  # fetching the html page
  response = requests.get(SOURCE_URL, timeout=REQUEST_TIMEOUT)
  # fail loudly on 4xx/5xx instead of building a feed from an error page
  response.raise_for_status()
  doc = response.text

  # getting the items: every row of a table nested inside a table cell
  tree = etree.HTML(doc)
  items = tree.xpath('//table/tr/td/table/tr')

  # creating a feed
  feed = feedgenerator.Rss201rev2Feed(title="Foo",
          link="https://foo/bar",
          description="Foo",
          language="fr")

  # for each line in the table, one feed item; cells that are missing
  # fall back to the placeholder 'empty'
  for row in items:
      feed.add_item(
          # title: text of the 4th cell
          title=first_or_empty(row.xpath('td[position()=4]/text()')),
          # link: href of the anchor in the 3rd cell
          link=first_or_empty(row.xpath('td[position()=3]/a/@href')),
          # description: text of the 2nd cell
          description=first_or_empty(row.xpath('td[position()=2]/text()')),
          # identifier: text of the 1st cell
          unique_id=first_or_empty(row.xpath('td[position()=1]/text()'))
      )

  # CGI response: header line, blank separator line, then the feed itself.
  # Parenthesised single-argument print works on both Python 2 and Python 3.
  print("Content-type: application/rss+xml; charset=utf-8")
  print("")
  print(feed.writeString('utf-8'))
Tagged as: