Changeset - 9fdbdf39b43b
[Not reviewed]
default
0 2 0
Nathan Brink (binki) - 15 years ago 2010-10-25 23:24:40
ohnobinki@ohnopublishing.net
Move geturi() from calvin's crawler so that all crawlers can use it, renamed to school_crawl_geturi().
2 files changed with 148 insertions and 148 deletions:
0 comments (0 inline, 0 general)
inc/school.crawl.inc
Show inline comments
 
@@ -113,3 +113,148 @@ function school_crawl_days_str_format($d
 

	
 
  return school_crawl_days_format($day_initials);
 
}
 

	
 
/**
 * \brief
 *   Simulate some aspects of a web browser while retrieving a
 *   document.
 *
 * This allows us to view our cookies in an associative array and to
 * have the server's response automatically update our cookies.
 *
 * If $post is specified as a non-empty associative array, the data
 * is encoded as if we were performing a form submission and handed
 * to cURL as the request body (which makes cURL issue an HTTP POST).
 *
 * Follows redirects by calling itself with the value of the Location
 * header. If there is a redirect, the page from which you are
 * redirected is lost. The redirected request never carries the POST
 * data. The Location value is used as-is (apart from the Calvin
 * WebAdvisor rewrite below), so relative redirect targets are not
 * resolved against the original URI.
 *
 * \param $uri
 *   The URL to fetch. If a redirect is followed, this is updated to
 *   the redirect target.
 * \param $cookies
 *   An associative array (name => value) of cookies to send; cookies
 *   set by the server are stored back into it. A non-array value is
 *   treated as an empty cookie jar.
 * \param $post
 *   If not NULL, causes an HTTP POST. In that case, should be an
 *   associative array of form keys/values.
 * \param $verbosity
 *   How verbose to be. Values above 5, 8 and 9 echo progressively
 *   more debugging output.
 * \param $loopspin
 *   An internal counter of how many redirects have been followed so
 *   far, used to prevent us from following perpetual redirects. At
 *   most 6 redirects are followed.
 * \return
 *   The body of the document returned by the server (normally
 *   malformed HTML, especially with Calvin's WebAdvisor
 *   installation).
 */
function school_crawl_geturi(&$uri, &$cookies, $post = NULL, $verbosity = 0, $loopspin = 0)
{
  global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf, $school_crawl_geturi_verbosity;

  if ($verbosity > 5)
    {
      echo "\n";
      echo 'school_crawl_geturi(' . $uri . ")\n";
      echo "\n";
    }

  /* Tolerate callers which hand us an uninitialized cookie jar. */
  if (!is_array($cookies))
    $cookies = array();

  $curl = curl_init();

  /*
   * The cURL callbacks append to these globals, so they must be
   * reset before every request.
   */
  $school_crawl_geturi_verbosity = $verbosity;
  $school_crawl_geturi_write_buf = '';
  $school_crawl_geturi_headers_buf = '';
  curl_setopt($curl, CURLOPT_URL, $uri);

  $cookie_pairs = array();
  foreach ($cookies as $key => $val)
    $cookie_pairs[] = $key . '=' . $val;
  $cookies_str = implode(';', $cookie_pairs);

  if ($verbosity > 8)
    echo 'cookies sent: ' . $cookies_str . "\n";
  curl_setopt($curl, CURLOPT_COOKIE, $cookies_str);
  curl_setopt($curl, CURLOPT_HEADERFUNCTION, 'school_crawl_geturi_header_cb');
  curl_setopt($curl, CURLOPT_WRITEFUNCTION, 'school_crawl_geturi_write_cb');

  if ($post != NULL && is_array($post))
    {
      $post_pairs = array();
      foreach ($post as $postkey => $postval)
	{
	  /*
	   * Values of keys containing 'MEMBER' are sent verbatim
	   * instead of being urlencode()d -- presumably a quirk
	   * required by Calvin's WebAdvisor; verify before changing.
	   */
	  $post_pairs[] = urlencode($postkey) . '='
	    . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval);
	}
      $posttxt = implode('&', $post_pairs);
      if ($verbosity > 8)
	echo 'setting POST to ' . $posttxt . "\n";

      /* Setting CURLOPT_POSTFIELDS is what turns the request into a POST. */
      curl_setopt($curl, CURLOPT_POSTFIELDS, $posttxt);
    }

  curl_exec($curl);
  curl_close($curl);

  $location = NULL;
  foreach (explode("\r\n", $school_crawl_geturi_headers_buf) as $header)
    {
      /*
       * yes, we don't want the line if the first char is a ':' or if
       * it has no ':' (strpos() yields 0 or FALSE respectively).
       */
      $colon_pos = strpos($header, ':');
      if (!$colon_pos)
	continue;

      /*
       * Split on the colon alone and trim the value: the space after
       * the colon is optional in HTTP.
       */
      $header_name = substr($header, 0, $colon_pos);
      $header_val = trim(substr($header, $colon_pos + 1));

      if ($verbosity > 8)
	echo $header_name . ' : ' . $header_val . "\n";

      /* Header names are case-insensitive (HTTP/2 sends them lowercased). */
      switch (strtolower($header_name))
	{
	case 'set-cookie':
	  $cookie_parts = explode('=', $header_val, 2);
	  if (count($cookie_parts) < 2)
	    /* Not a name=value pair; nothing we can store. */
	    break;
	  $cookie_name = trim($cookie_parts[0]);
	  $cookie_val = $cookie_parts[1];

	  /*
	   * Drop cookie attributes (``; path=/; HttpOnly'' etc.) so
	   * they are not replayed to the server as if they were
	   * cookies themselves.
	   */
	  $attr_pos = strpos($cookie_val, ';');
	  if ($attr_pos !== FALSE)
	    $cookie_val = substr($cookie_val, 0, $attr_pos);
	  $cookie_val = trim($cookie_val);

	  if ($verbosity > 9)
	    {
	      if (isset($cookies[$cookie_name]))
		echo 'Replacing cookie ' . $cookie_name . '=' . $cookies[$cookie_name]
		  . ' with ';
	      echo 'new cookie ' . $cookie_name . '=' . $cookie_val . "\n";
	    }
	  $cookies[$cookie_name] = $cookie_val;
	  break;

	case 'location':
	  /*
	   * Calvin-specific: rewrite redirects pointing at
	   * kvdata.calvin.edu/WebAdvisor to the /walive/ path. Other
	   * URLs pass through the regex untouched.
	   */
	  $location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $header_val);
	  /* A redirect is always followed with a plain GET. */
	  $post = NULL;
	  break;
	}
    }

  if ($verbosity > 9)
    echo $school_crawl_geturi_write_buf;
  if ($location && $loopspin < 6)
    {
      $uri = $location;
      /*
       * $verbosity must be passed explicitly so that $loopspin lands
       * in the right parameter and the redirect limit takes effect.
       */
      return school_crawl_geturi($uri, $cookies, $post, $verbosity, $loopspin + 1);
    }
  return $school_crawl_geturi_write_buf;
}
 

	
 
/**
 * \brief
 *   cURL CURLOPT_HEADERFUNCTION callback for school_crawl_geturi().
 *
 * Appends each chunk of header data to the global
 * $school_crawl_geturi_headers_buf.
 *
 * \param $curl
 *   The cURL handle (unused).
 * \param $header_buf
 *   The chunk of header data handed over by cURL.
 * \return
 *   The number of bytes consumed, which is always the full length of
 *   $header_buf.
 */
function school_crawl_geturi_header_cb($curl, $header_buf)
{
  global $school_crawl_geturi_headers_buf;

  $consumed = strlen($header_buf);
  $school_crawl_geturi_headers_buf = $school_crawl_geturi_headers_buf . $header_buf;

  return $consumed;
}
 

	
 
/**
 * \brief
 *   cURL CURLOPT_WRITEFUNCTION callback for school_crawl_geturi().
 *
 * Appends each chunk of the response body to the global
 * $school_crawl_geturi_write_buf.
 *
 * \param $curl
 *   The cURL handle (unused).
 * \param $write_buf
 *   The chunk of body data handed over by cURL.
 * \return
 *   The number of bytes consumed, which is always the full length of
 *   $write_buf.
 */
function school_crawl_geturi_write_cb($curl, $write_buf)
{
  global $school_crawl_geturi_write_buf;

  $consumed = strlen($write_buf);
  $school_crawl_geturi_write_buf = $school_crawl_geturi_write_buf . $write_buf;

  return $consumed;
}
school.d/calvin.crawl.inc
Show inline comments
 
@@ -56,7 +56,7 @@ function calvin_crawl(Semester $semester
 
  $baseuri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 

	
 
  $token_uri = $baseuri . '&TOKENIDX=NULL';
 
  $token_html = calvin_crawl_noscript_filter(geturi($token_uri, $cookies));
 
  $token_html = calvin_crawl_noscript_filter(school_crawl_geturi($token_uri, $cookies));
 
  if (!preg_match('/setWindowHTML\(\'\', \'([0-9]+)\'\);/', $token_html, $matches))
 
    {
 
      fprintf(STDERR, "Could not steal the token\n");
 
@@ -77,7 +77,7 @@ function calvin_crawl(Semester $semester
 
   * individual department for courses.
 
   */
 
  $uri = $baseuri . '&TOKENIDX=' . $token;
 
  $departments_html = calvin_crawl_noscript_filter(geturi($uri, $cookies));
 
  $departments_html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies));
 

	
 
  $departments_dom = new DOMDocument();
 
  $departments_dom->loadHTML($departments_html);
 
@@ -229,7 +229,7 @@ function calvin_crawl(Semester $semester
 
  $pages = array(1 => 0, 2=> 1);
 
  while ($pages[1] < $pages[2])
 
    {
 
      $html = calvin_crawl_noscript_filter(geturi($uri, $cookies, $form));
 
      $html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $form));
 

	
 
      $results_dom = new DOMDocument();
 
      $results_dom->loadHTML($html);	
 
@@ -365,151 +365,6 @@ function calvin_crawl(Semester $semester
 

	
 
/**
 * \brief
 *   Simulate some aspects of a web browser while retrieving a
 *   document.
 *
 * This allows us to view our cookies in an associative array and to
 * have the server's response automatically update our cookies.
 *
 * If $post is specified as a non-empty associative array, the data
 * is encoded as if we were performing a form submission and handed
 * to cURL as the request body (which makes cURL issue an HTTP POST).
 *
 * Follows redirects by calling itself with the value of the Location
 * header. If there is a redirect, the page from which you are
 * redirected is lost. The redirected request never carries the POST
 * data. The Location value is used as-is (apart from the Calvin
 * WebAdvisor rewrite below), so relative redirect targets are not
 * resolved against the original URI.
 *
 * \param $uri
 *   The URL to fetch. If a redirect is followed, this is updated to
 *   the redirect target.
 * \param $cookies
 *   An associative array (name => value) of cookies to send; cookies
 *   set by the server are stored back into it. A non-array value is
 *   treated as an empty cookie jar.
 * \param $post
 *   If not NULL, causes an HTTP POST. In that case, should be an
 *   associative array of form keys/values.
 * \param $verbosity
 *   How verbose to be. Values above 5, 8 and 9 echo progressively
 *   more debugging output.
 * \param $loopspin
 *   An internal counter of how many redirects have been followed so
 *   far, used to prevent us from following perpetual redirects. At
 *   most 6 redirects are followed.
 * \return
 *   The body of the document returned by the server (normally
 *   malformed HTML, especially with Calvin's WebAdvisor
 *   installation).
 */
function geturi(&$uri, &$cookies, $post = NULL, $verbosity = 0, $loopspin = 0)
{
  global $geturi_write_buf, $geturi_headers_buf, $geturi_verbosity;

  if ($verbosity > 5)
    {
      echo "\n";
      echo 'geturi(' . $uri . ")\n";
      echo "\n";
    }

  /* Tolerate callers which hand us an uninitialized cookie jar. */
  if (!is_array($cookies))
    $cookies = array();

  $curl = curl_init();

  /*
   * The cURL callbacks append to these globals, so they must be
   * reset before every request.
   */
  $geturi_verbosity = $verbosity;
  $geturi_write_buf = '';
  $geturi_headers_buf = '';
  curl_setopt($curl, CURLOPT_URL, $uri);

  $cookie_pairs = array();
  foreach ($cookies as $key => $val)
    $cookie_pairs[] = $key . '=' . $val;
  $cookies_str = implode(';', $cookie_pairs);

  if ($verbosity > 8)
    echo 'cookies sent: ' . $cookies_str . "\n";
  curl_setopt($curl, CURLOPT_COOKIE, $cookies_str);
  curl_setopt($curl, CURLOPT_HEADERFUNCTION, 'geturi_header_cb');
  curl_setopt($curl, CURLOPT_WRITEFUNCTION, 'geturi_write_cb');

  if ($post != NULL && is_array($post))
    {
      $post_pairs = array();
      foreach ($post as $postkey => $postval)
	{
	  /*
	   * Values of keys containing 'MEMBER' are sent verbatim
	   * instead of being urlencode()d -- presumably a quirk
	   * required by Calvin's WebAdvisor; verify before changing.
	   */
	  $post_pairs[] = urlencode($postkey) . '='
	    . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval);
	}
      $posttxt = implode('&', $post_pairs);
      if ($verbosity > 8)
	echo 'setting POST to ' . $posttxt . "\n";

      /* Setting CURLOPT_POSTFIELDS is what turns the request into a POST. */
      curl_setopt($curl, CURLOPT_POSTFIELDS, $posttxt);
    }

  curl_exec($curl);
  curl_close($curl);

  $location = NULL;
  foreach (explode("\r\n", $geturi_headers_buf) as $header)
    {
      /*
       * yes, we don't want the line if the first char is a ':' or if
       * it has no ':' (strpos() yields 0 or FALSE respectively).
       */
      $colon_pos = strpos($header, ':');
      if (!$colon_pos)
	continue;

      /*
       * Split on the colon alone and trim the value: the space after
       * the colon is optional in HTTP.
       */
      $header_name = substr($header, 0, $colon_pos);
      $header_val = trim(substr($header, $colon_pos + 1));

      if ($verbosity > 8)
	echo $header_name . ' : ' . $header_val . "\n";

      /* Header names are case-insensitive (HTTP/2 sends them lowercased). */
      switch (strtolower($header_name))
	{
	case 'set-cookie':
	  $cookie_parts = explode('=', $header_val, 2);
	  if (count($cookie_parts) < 2)
	    /* Not a name=value pair; nothing we can store. */
	    break;
	  $cookie_name = trim($cookie_parts[0]);
	  $cookie_val = $cookie_parts[1];

	  /*
	   * Drop cookie attributes (``; path=/; HttpOnly'' etc.) so
	   * they are not replayed to the server as if they were
	   * cookies themselves.
	   */
	  $attr_pos = strpos($cookie_val, ';');
	  if ($attr_pos !== FALSE)
	    $cookie_val = substr($cookie_val, 0, $attr_pos);
	  $cookie_val = trim($cookie_val);

	  if ($verbosity > 9)
	    {
	      if (isset($cookies[$cookie_name]))
		echo 'Replacing cookie ' . $cookie_name . '=' . $cookies[$cookie_name]
		  . ' with ';
	      echo 'new cookie ' . $cookie_name . '=' . $cookie_val . "\n";
	    }
	  $cookies[$cookie_name] = $cookie_val;
	  break;

	case 'location':
	  /*
	   * Calvin-specific: rewrite redirects pointing at
	   * kvdata.calvin.edu/WebAdvisor to the /walive/ path. Other
	   * URLs pass through the regex untouched.
	   */
	  $location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $header_val);
	  /* A redirect is always followed with a plain GET. */
	  $post = NULL;
	  break;
	}
    }

  if ($verbosity > 9)
    echo $geturi_write_buf;
  if ($location && $loopspin < 6)
    {
      $uri = $location;
      /*
       * $verbosity must be passed explicitly so that $loopspin lands
       * in the right parameter and the redirect limit takes effect.
       */
      return geturi($uri, $cookies, $post, $verbosity, $loopspin + 1);
    }
  return $geturi_write_buf;
}
 

	
 
/**
 * \brief
 *   cURL CURLOPT_HEADERFUNCTION callback for geturi().
 *
 * Appends each chunk of header data to the global
 * $geturi_headers_buf.
 *
 * \param $curl
 *   The cURL handle (unused).
 * \param $header_buf
 *   The chunk of header data handed over by cURL.
 * \return
 *   The number of bytes consumed, which is always the full length of
 *   $header_buf.
 */
function geturi_header_cb($curl, $header_buf)
{
  global $geturi_headers_buf;

  $consumed = strlen($header_buf);
  $geturi_headers_buf = $geturi_headers_buf . $header_buf;

  return $consumed;
}
 

	
 
/**
 * \brief
 *   cURL CURLOPT_WRITEFUNCTION callback for geturi().
 *
 * Appends each chunk of the response body to the global
 * $geturi_write_buf.
 *
 * \param $curl
 *   The cURL handle (unused).
 * \param $write_buf
 *   The chunk of body data handed over by cURL.
 * \return
 *   The number of bytes consumed, which is always the full length of
 *   $write_buf.
 */
function geturi_write_cb($curl, $write_buf)
{
  global $geturi_write_buf;

  $consumed = strlen($write_buf);
  $geturi_write_buf = $geturi_write_buf . $write_buf;

  return $consumed;
}
 

	
 
/**
 
 * \brief
 
 *   Find an <input /> element and return its value attribute.
 
 *
 
 * \param $domdocument
0 comments (0 inline, 0 general)