Changeset - 9fdbdf39b43b
[Not reviewed]
default
0 2 0
Nathan Brink (binki) - 15 years ago 2010-10-25 23:24:40
ohnobinki@ohnopublishing.net
Move geturi() from calvin's crawler so that all crawlers can use it, renamed to school_crawl_geturi().
2 files changed with 148 insertions and 148 deletions:
0 comments (0 inline, 0 general)
inc/school.crawl.inc
Show inline comments
 
@@ -104,12 +104,157 @@ function school_crawl_days_format($days)
 
 *   Example input: 'mwf', 'TR'.
 
 * \return
 
 *   Same as school_crawl_days_format()
 
 */
 
function school_crawl_days_str_format($days_str)
{
  /*
   * Break the string into its individual characters; each character
   * is one day initial which school_crawl_days_format() understands.
   */
  $initials = array();
  $length = strlen($days_str);
  $pos = 0;
  while ($pos < $length)
    {
      $initials[] = $days_str[$pos];
      $pos ++;
    }

  return school_crawl_days_format($initials);
}
 

	
 
/**
 * \brief
 *   Simulate some aspects of a web browser while retrieving a
 *   document.
 *
 * This allows us to view our cookies in an associative array and to
 * have the server's response automatically update our cookies.
 *
 * If $post is specified as an associative array, an HTTP POST is
 * performed and the data is encoded properly as if we were performing
 * a form submission.
 *
 * Follows redirects. If there is a redirect, the page from which you
 * are redirected is lost... but few people put any information on
 * those pages anyways ;-).
 *
 * \param $uri
 *   The URL to fetch. If a redirect occurs, this is updated.
 * \param $cookies
 *   An associative array of cookies and where to save new cookies.
 * \param $post
 *   If not NULL, causes an HTTP POST. In that case, should be an
 *   associative array of form keys/values. Values whose key contains
 *   'MEMBER' are sent as-is, without URL-encoding.
 * \param $verbosity
 *   How verbose to be.
 * \param $loopspin
 *   An internal variable to prevent us from following perpetual
 *   redirects.
 * \return
 *   The body of the document returned by the server (normally
 *   malformed HTML, especially with Calvin's WebAdvisor
 *   installation).
 */
function school_crawl_geturi(&$uri, &$cookies, $post = NULL, $verbosity = 0, $loopspin = 0)
{
  global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf, $school_crawl_geturi_verbosity;

  if ($verbosity > 5)
    {
      echo "\n";
      echo 'school_crawl_geturi(' . $uri . ")\n";
      echo "\n";
    }

  $curl = curl_init();

  /* Reset the buffers which the cURL callbacks append to. */
  $school_crawl_geturi_verbosity = $verbosity;
  $school_crawl_geturi_write_buf = '';
  $school_crawl_geturi_headers_buf = '';
  curl_setopt($curl, CURLOPT_URL, $uri);

  /* Serialize our cookie jar into a single Cookie header value. */
  $cookies_str = '';
  foreach ($cookies as $key => $val)
    {
      if (strlen($cookies_str))
        $cookies_str .= ';';
      $cookies_str .= $key . '=' . $val;
    }

  if ($verbosity > 8)
    echo 'cookies sent: ' . $cookies_str . "\n";
  curl_setopt($curl, CURLOPT_COOKIE, $cookies_str);
  curl_setopt($curl, CURLOPT_HEADERFUNCTION, 'school_crawl_geturi_header_cb');
  curl_setopt($curl, CURLOPT_WRITEFUNCTION, 'school_crawl_geturi_write_cb');

  if ($post != NULL && is_array($post))
    {
      $posttxt = '';
      foreach ($post as $postkey => $postval)
        {
          $posttxt .= (strlen($posttxt) ? '&' : '')
            . urlencode($postkey) . '=' . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval);
        }
      if ($verbosity > 8)
        echo 'setting POST to ' . $posttxt . "\n";

      /* Setting CURLOPT_POSTFIELDS is what turns the request into a POST. */
      curl_setopt($curl, CURLOPT_POSTFIELDS, $posttxt);
    }

  curl_exec($curl);
  curl_close($curl);

  $location = NULL;
  foreach (explode("\r\n", $school_crawl_geturi_headers_buf) as $header)
    {
      /*
       * yes, we don't want the line if the first char is a ':' or if it has no ':'
       */
      if (!strpos($header, ':'))
        continue;

      /*
       * Split on the bare ':' and strip the optional whitespace so
       * that a header written without a space after the colon does
       * not leave $header_val undefined.
       */
      list($header_name, $header_val) = explode(':', $header, 2);
      $header_val = ltrim($header_val);

      if ($verbosity > 8)
        echo $header_name . ' : ' . $header_val . "\n";

      /*
       * Header names are case-insensitive (and arrive lowercased over
       * HTTP/2), so compare them in lowercase.
       */
      switch (strtolower($header_name))
        {
        case 'set-cookie':
          list($cookie_name, $cookie_val) = explode('=', $header_val, 2);
          if ($verbosity > 9)
            {
              if (isset($cookies[$cookie_name]))
                echo 'Replacing cookie ' . $cookie_name . '=' . $cookies[$cookie_name]
                  . ' with ';
              echo 'new cookie ' . $cookie_name . '=' . $cookie_val . "\n";
            }
          $cookies[$cookie_name] = $cookie_val;
          break;

        case 'location':
          /*
           * Calvin-specific: redirects to .../WebAdvisor are rewritten
           * to .../walive/WebAdvisor. No trailing newline is appended,
           * since the result is used as the next URL and is handed
           * back to the caller through $uri.
           */
          $location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $header_val);
          /* The redirect target is fetched with a plain GET. */
          $post = NULL;
          break;
        }
    }

  if ($verbosity > 9)
    echo $school_crawl_geturi_write_buf;
  if ($location && $loopspin < 6)
    {
      $uri = $location;
      /*
       * Pass $verbosity through explicitly so that $loopspin actually
       * increases and the redirect limit takes effect.
       */
      return school_crawl_geturi($uri, $cookies, $post, $verbosity, $loopspin + 1);
    }
  return $school_crawl_geturi_write_buf;
}
 

	
 
/**
 * \brief
 *   cURL header callback: accumulate raw response header data.
 *
 * \param $curl
 *   The cURL handle (unused).
 * \param $header_buf
 *   The chunk of header data handed to us by cURL.
 * \return
 *   The number of bytes consumed, which is always the whole chunk.
 */
function school_crawl_geturi_header_cb($curl, $header_buf)
{
  global $school_crawl_geturi_headers_buf;

  $consumed = strlen($header_buf);
  $school_crawl_geturi_headers_buf = $school_crawl_geturi_headers_buf . $header_buf;

  return $consumed;
}
 

	
 
/**
 * \brief
 *   cURL write callback: accumulate the response body.
 *
 * \param $curl
 *   The cURL handle (unused).
 * \param $write_buf
 *   The chunk of body data handed to us by cURL.
 * \return
 *   The number of bytes consumed, which is always the whole chunk.
 */
function school_crawl_geturi_write_cb($curl, $write_buf)
{
  global $school_crawl_geturi_write_buf;

  $consumed = strlen($write_buf);
  $school_crawl_geturi_write_buf = $school_crawl_geturi_write_buf . $write_buf;

  return $consumed;
}
school.d/calvin.crawl.inc
Show inline comments
 
@@ -47,46 +47,46 @@ function calvin_crawl(Semester $semester
 
   * 2. https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL&TOKENIDX=NULL
 
   *    <body onload="javascript:setWindowHTML('', '7699844013');">
 
   *
 
   *    In the above, the second argument to setWindowHTML() is
 
   *    random. Thus, we have to capture this value.
 
   */
 

	
 
  $cookies = array();
 

	
 
  $baseuri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 

	
 
  $token_uri = $baseuri . '&TOKENIDX=NULL';
 
  $token_html = calvin_crawl_noscript_filter(geturi($token_uri, $cookies));
 
  $token_html = calvin_crawl_noscript_filter(school_crawl_geturi($token_uri, $cookies));
 
  if (!preg_match('/setWindowHTML\(\'\', \'([0-9]+)\'\);/', $token_html, $matches))
 
    {
 
      fprintf(STDERR, "Could not steal the token\n");
 
      return 1;
 
    }
 
  $token = $matches[1];
 

	
 
  if ($verbosity > 5)
 
    {
 
      echo 'token: ' . $token . "\n";
 
      echo "\n";
 
    }
 

	
 
  /*
 
   * here we have arrived at the main webadvisor screen which lists the
 
   * search form. From here, we can get a list of all of the departments
 
   * that Calvin College has and then know enough to query each
 
   * individual department for courses.
 
   */
 
  $uri = $baseuri . '&TOKENIDX=' . $token;
 
  $departments_html = calvin_crawl_noscript_filter(geturi($uri, $cookies));
 
  $departments_html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies));
 

	
 
  $departments_dom = new DOMDocument();
 
  $departments_dom->loadHTML($departments_html);
 

	
 
  /*
 
   * Discover the available semesters
 
   */
 
  $semesters_select_nodes = $departments_dom->getElementById('VAR1')->childNodes;
 
  $semester_strs = array();
 
  foreach ($semesters_select_nodes as $semester_node)
 
    {
 
      if ($semester_node->tagName != 'option'
 
@@ -220,25 +220,25 @@ function calvin_crawl(Semester $semester
 
   */
 
  /*
 
    for ($day = 10; $day <= 16; $day ++)
 
    $form['VAR' . $day] = '';
 
  */
 

	
 
  /*
 
   * pages is populated by preg_match() below after the first looping.
 
   */
 
  $pages = array(1 => 0, 2=> 1);
 
  while ($pages[1] < $pages[2])
 
    {
 
      $html = calvin_crawl_noscript_filter(geturi($uri, $cookies, $form));
 
      $html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $form));
 

	
 
      $results_dom = new DOMDocument();
 
      $results_dom->loadHTML($html);	
 

	
 
      $list_done = FALSE;
 
      for ($list_row = 1; !$list_done; $list_row ++)
 
	{
 
	  /* either 'Open' (or 'Closed'?) */
 
	  $openness = dom_input_value($results_dom, 'LIST.VAR1_' . $list_row);
 
	  $sec_short_title = dom_id_content($results_dom, 'SEC_SHORT_TITLE_' . $list_row);
 
	  $sec_meeting_info = dom_id_content($results_dom, 'SEC_MEETING_INFO_' . $list_row);
 

	
 
@@ -356,169 +356,24 @@ function calvin_crawl(Semester $semester
 
	if (!$num)
 
	  continue;
 
	if (!$has_stat)
 
	  error_log('Skipped some sections for <reason>: <number skipped>:');
 
	error_log($reason . ': ' . $num);
 
      }
 

	
 
  return 0;
 
}
 

	
 
/**
 * \brief
 *   Simulate some aspects of a web browser while retrieving a
 *   document.
 *
 * This allows us to view our cookies in an associative array and to
 * have the server's response automatically update our cookies.
 *
 * If $post is specified as an associative array, an HTTP POST is
 * performed and the data is encoded properly as if we were performing
 * a form submission.
 *
 * Follows redirects. If there is a redirect, the page from which you
 * are redirected is lost... but few people put any information on
 * those pages anyways ;-).
 *
 * \param $uri
 *   The URL to fetch. If a redirect occurs, this is updated.
 * \param $cookies
 *   An associative array of cookies and where to save new cookies.
 * \param $post
 *   If not NULL, causes an HTTP POST. In that case, should be an
 *   associative array of form keys/values. Values whose key contains
 *   'MEMBER' are sent as-is, without URL-encoding.
 * \param $verbosity
 *   How verbose to be.
 * \param $loopspin
 *   An internal variable to prevent us from following perpetual
 *   redirects.
 * \return
 *   The body of the document returned by the server (normally
 *   malformed HTML, especially with Calvin's WebAdvisor
 *   installation).
 */
function geturi(&$uri, &$cookies, $post = NULL, $verbosity = 0, $loopspin = 0)
{
  global $geturi_write_buf, $geturi_headers_buf, $geturi_verbosity;

  if ($verbosity > 5)
    {
      echo "\n";
      echo 'geturi(' . $uri . ")\n";
      echo "\n";
    }

  $curl = curl_init();

  /* Reset the buffers which the cURL callbacks append to. */
  $geturi_verbosity = $verbosity;
  $geturi_write_buf = '';
  $geturi_headers_buf = '';
  curl_setopt($curl, CURLOPT_URL, $uri);

  /* Serialize our cookie jar into a single Cookie header value. */
  $cookies_str = '';
  foreach ($cookies as $key => $val)
    {
      if (strlen($cookies_str))
        $cookies_str .= ';';
      $cookies_str .= $key . '=' . $val;
    }

  if ($verbosity > 8)
    echo 'cookies sent: ' . $cookies_str . "\n";
  curl_setopt($curl, CURLOPT_COOKIE, $cookies_str);
  curl_setopt($curl, CURLOPT_HEADERFUNCTION, 'geturi_header_cb');
  curl_setopt($curl, CURLOPT_WRITEFUNCTION, 'geturi_write_cb');

  if ($post != NULL && is_array($post))
    {
      $posttxt = '';
      foreach ($post as $postkey => $postval)
        {
          $posttxt .= (strlen($posttxt) ? '&' : '')
            . urlencode($postkey) . '=' . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval);
        }
      if ($verbosity > 8)
        echo 'setting POST to ' . $posttxt . "\n";

      /* Setting CURLOPT_POSTFIELDS is what turns the request into a POST. */
      curl_setopt($curl, CURLOPT_POSTFIELDS, $posttxt);
    }

  curl_exec($curl);
  curl_close($curl);

  $location = NULL;
  foreach (explode("\r\n", $geturi_headers_buf) as $header)
    {
      /*
       * yes, we don't want the line if the first char is a ':' or if it has no ':'
       */
      if (!strpos($header, ':'))
        continue;

      /*
       * Split on the bare ':' and strip the optional whitespace so
       * that a header written without a space after the colon does
       * not leave $header_val undefined.
       */
      list($header_name, $header_val) = explode(':', $header, 2);
      $header_val = ltrim($header_val);

      if ($verbosity > 8)
        echo $header_name . ' : ' . $header_val . "\n";

      /*
       * Header names are case-insensitive (and arrive lowercased over
       * HTTP/2), so compare them in lowercase.
       */
      switch (strtolower($header_name))
        {
        case 'set-cookie':
          list($cookie_name, $cookie_val) = explode('=', $header_val, 2);
          if ($verbosity > 9)
            {
              if (isset($cookies[$cookie_name]))
                echo 'Replacing cookie ' . $cookie_name . '=' . $cookies[$cookie_name]
                  . ' with ';
              echo 'new cookie ' . $cookie_name . '=' . $cookie_val . "\n";
            }
          $cookies[$cookie_name] = $cookie_val;
          break;

        case 'location':
          /*
           * Calvin-specific: redirects to .../WebAdvisor are rewritten
           * to .../walive/WebAdvisor. No trailing newline is appended,
           * since the result is used as the next URL and is handed
           * back to the caller through $uri.
           */
          $location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $header_val);
          /* The redirect target is fetched with a plain GET. */
          $post = NULL;
          break;
        }
    }

  if ($verbosity > 9)
    echo $geturi_write_buf;
  if ($location && $loopspin < 6)
    {
      $uri = $location;
      /*
       * Pass $verbosity through explicitly so that $loopspin actually
       * increases and the redirect limit takes effect.
       */
      return geturi($uri, $cookies, $post, $verbosity, $loopspin + 1);
    }
  return $geturi_write_buf;
}
 

	
 
/**
 * \brief
 *   cURL header callback: accumulate raw response header data.
 *
 * \param $curl
 *   The cURL handle (unused).
 * \param $header_buf
 *   The chunk of header data handed to us by cURL.
 * \return
 *   The number of bytes consumed, which is always the whole chunk.
 */
function geturi_header_cb($curl, $header_buf)
{
  global $geturi_headers_buf;

  $consumed = strlen($header_buf);
  $geturi_headers_buf = $geturi_headers_buf . $header_buf;

  return $consumed;
}
 

	
 
/**
 * \brief
 *   cURL write callback: accumulate the response body.
 *
 * \param $curl
 *   The cURL handle (unused).
 * \param $write_buf
 *   The chunk of body data handed to us by cURL.
 * \return
 *   The number of bytes consumed, which is always the whole chunk.
 */
function geturi_write_cb($curl, $write_buf)
{
  global $geturi_write_buf;

  $consumed = strlen($write_buf);
  $geturi_write_buf = $geturi_write_buf . $write_buf;

  return $consumed;
}
 

	
 
/**
 
 * \brief
 
 *   Find an <input /> element and return its value attribute.
 
 *
 
 * \param $domdocument
 
 *   The DOMDocument to search.
 
 * \param $name
 
 *   The name attribute of the <input /> element.
 
 * \return
 
 *   The value attribute of the input element or NULL if not found.
 
 */
 
function dom_input_value($domdocument, $name)
 
{
 
  $xpath = new DOMXPath($domdocument);
0 comments (0 inline, 0 general)