SlatePermutate Files · inc/school.crawl.inc

Files @ 318a910b91ad
Branch filter:
Location: SlatePermutate/inc/school.crawl.inc

318a910b91ad 17.2 KiB text/x-povray Show Annotation Show as Raw Download as Raw
binki
Now, instead of automatically guessing that a user wants to register for the next semester, assume they want to register for a semester whose middle is half a year into the future.
<?php
/*
 * Copyright 2010 Nathan Phillip Brink <ohnobinki@ohnopublishing.net>
 *
 * This file is a part of slate_permutate.
 *
 * slate_permutate is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * slate_permutate is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with slate_permutate.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * \file
 *   Routines that are only useful when crawling schools' websites for
 *   autofill section data.
 */

/**
 * \brief
 *   Build a fresh school_crawl_log handle.
 *
 * The handle is a plain array. It always carries the 'school' it was
 * created for and an 'out' array holding two (initially empty) lists
 * of stored log lines, 'html' and 'plain'. Everything in $opts is
 * merged in afterwards, except that $opts can never override the
 * 'school' or 'out' keys.
 *
 * \param $school
 *   The school for which this crawl handle is.
 * \param $opts
 *   An array optionally containing any of the following keys:
 *   - stream: an fopen()-compatible stream to fwrite()/fprintf() output to.
 *   - page: a Page object used to help format HTML output.
 *   - verbosity: A number from 0 through 10 describing the desired
 *       verbosity. Defaults to 5 when absent.
 * \return
 *   The new school_crawl_log handle.
 */
function school_crawl_log_init(array $school, $opts = array())
{
  $defaults = array('verbosity' => 5);
  $handle = array(
    'school' => $school,
    'out' => array('html' => array(), 'plain' => array()),
  );

  /*
   * Array union keeps the left-hand value on key collisions, so the
   * fixed keys beat $opts and $opts beats the defaults.
   */
  return $handle + $opts + $defaults;
}

/**
 * \brief
 *   Log progress of a crawler.
 *
 * This function's arguments take the same style as fprintf() does:
 * any arguments following $format are substituted into it.
 *
 * The formatted line is appended to the handle's stored plain-text
 * output, to its stored HTML output if the handle has a 'page', and
 * is written to the handle's 'stream' if it has one.
 *
 * \param $school_crawl_log
 *   The logging handle created by school_crawl_log_init(). It is
 *   taken by reference because the stored output lives inside the
 *   handle itself; were it passed by value, every stored line would
 *   be thrown away on return and school_crawl_log_fetch() would
 *   never have anything to report.
 * \param $verboseness
 *   The verbosity level at which to log the message. Should be a
 *   value from 0 to 10, where 0 is unconditionally printed and 5 is
 *   the default.
 * \param $format
 *   The printf()-style format string.
 * \return
 *   Always 0, whether or not the message was verbose enough to be
 *   recorded.
 */
function school_crawl_logf(array &$school_crawl_log, $verboseness, $format)
{
  /*
   * The given message gives us more detail than we want. Therefore,
   * discard it.
   */
  if ($verboseness > $school_crawl_log['verbosity'])
    return 0;

  /* $format and everything after it are handed straight to sprintf(). */
  $args = array_slice(func_get_args(), 2);
  $log_line = call_user_func_array('sprintf', $args);

  /* store output in a place where it's retrievable */
  $school_crawl_log['out']['plain'][] = sprintf("%s_crawl(): %s\n",
                                                $school_crawl_log['school']['id'], $log_line);

  /* store the HTML rendition only when we have a Page to help format it */
  if (isset($school_crawl_log['page']))
    $school_crawl_log['out']['html'][] = sprintf("<div class=\"logline\"><tt>%s_crawl()</tt>: %s</div><br class=\"logline\"%s>\n",
                                                 $school_crawl_log['school']['id'], htmlentities($log_line),
                                                 $school_crawl_log['page']->element_self_close());

  /* print to a stream potentially */
  if (isset($school_crawl_log['stream']))
    fprintf($school_crawl_log['stream'], "%s_crawl(): %s\n", $school_crawl_log['school']['id'], $log_line);

  return 0;
}

/**
 * \brief
 *   Recover the log lines stored in a school_crawl_log handle.
 *
 * \param $school_crawl_log
 *   The logging handle to read from.
 * \param $html
 *   Whether to retrieve HTML output. If the handle has a 'page', the
 *   HTML lines stored at logging time are returned. Otherwise the
 *   plain lines are converted on the fly: each one is passed through
 *   htmlentities() and nl2br().
 * \return
 *   An array of output lines.
 */
function school_crawl_log_fetch(array $school_crawl_log, $html = FALSE)
{
  if (!$html)
    return $school_crawl_log['out']['plain'];

  if (isset($school_crawl_log['page']))
    return $school_crawl_log['out']['html'];

  /*
   * htmlentities() and nl2br() work on single strings, not arrays,
   * so the stored lines have to be escaped one at a time.
   */
  $html_lines = array();
  foreach ($school_crawl_log['out']['plain'] as $plain_line)
    $html_lines[] = nl2br(htmlentities($plain_line));
  return $html_lines;
}

/**
 * \brief
 *   Turn a broken-down time into slate_permutate's time
 *   representation: four digits, zero-padded hour followed by
 *   zero-padded minute (for example '0905').
 *
 * \param $time
 *   An array compatible with the return value of strptime(). The only
 *   fields we use are 'tm_hour', which is from 0 through 23, and
 *   'tm_min', which may be from 0 through 59.
 * \return
 *   The formatted time string.
 */
function school_crawl_time_format($time)
{
  $hour_and_minute = array($time['tm_hour'], $time['tm_min']);
  return vsprintf('%02d%02d', $hour_and_minute);
}

/**
 * \brief
 *   mktime() for callers holding a strptime()-style array.
 *
 * strptime() counts months from 0 and years from 1900, whereas
 * mktime() wants months from 1 and the full year; this function does
 * that translation.
 *
 * \param $tm
 *   An array formatted as the output of strptime().
 * \return
 *   A unix timestamp.
 */
function school_crawl_mktime(array $tm)
{
  $month = $tm['tm_mon'] + 1;
  $year = $tm['tm_year'] + 1900;

  return mktime($tm['tm_hour'], $tm['tm_min'], $tm['tm_sec'], $month, $tm['tm_mday'], $year);
}

/**
 * \brief
 *   Take an array of day names and assemble them into
 *   slate_permutate's internal representation of a set of weekdays:
 *   a string with one letter per day.
 *
 * This function is intended to make it easy for one to take the
 * output of an explode() call. For example, to decode $days_str =
 * 'Monday, Tuesday, Friday', one would do
 * school_crawl_days_format(explode(', ', $days_str));
 *
 * Only the first two characters of each (trimmed, lowercased) name
 * are examined. 'th' means Thursday; any other two-character prefix
 * is reduced to its first character.
 *
 * \param $days
 *   An array of day names. These may be common abbreviations or
 *   truncations (any truncations must be two chars long for
 *   simplicity). One-char representations are supported too: use
 *   'm', 't', 'w', 'h', 'f' to distinguish Thursday ('h') from
 *   Tuesday ('t'). 'r' may also be used for Thursday. Case does not
 *   matter. 's' is for Saturday, based on CCBCMD.
 * \return
 *   The day letters, each appearing once, in the order in which they
 *   were first encountered in $days. Unrecognized entries are
 *   reported through error_log() and otherwise skipped.
 */
function school_crawl_days_format($days)
{
  static $one_char_days = array('m' => 'm', 't' => 't', 'w' => 'w', 'h' => 'h', 'r' => 'h', 'f' => 'f', 's' => 's');
  static $two_char_days = array('th' => 'h');

  $seen_days = array();
  foreach ($days as $day_orig)
    {
      $day = strtolower(substr(trim($day_orig), 0, 2));

      /* collapse a two-char representation down to a single char */
      if (strlen($day) > 1)
        $day = isset($two_char_days[$day]) ? $two_char_days[$day] : substr($day, 0, 1);

      if (!isset($one_char_days[$day]))
        {
          error_log('school_crawl_days_format() got invalid day specifier:'
                    . ' `' . $day_orig . '\' => `' . $day . '\'');
          continue;
        }

      /* keyed by letter so that duplicates fold together */
      $seen_days[$one_char_days[$day]] = TRUE;
    }

  return implode('', array_keys($seen_days));
}

/**
 * \brief
 *   Take a string of day initials and format it.
 *
 * The string is broken into its individual characters, each of which
 * is treated as a one-character day name and handed to
 * school_crawl_days_format().
 *
 * \param $days_str
 *   Example input: 'mwf', 'TR'.
 * \return
 *   Same as school_crawl_days_format()
 */
function school_crawl_days_str_format($days_str)
{
  $initials = array();
  $length = strlen($days_str);
  $offset = 0;
  while ($offset < $length)
    {
      $initials[] = substr($days_str, $offset, 1);
      $offset ++;
    }

  return school_crawl_days_format($initials);
}

/**
 * \brief
 *   Simulate some aspects of a web browser while retrieving a
 *   document.
 *
 * This allows us to view our cookies in an associative array and to
 * have the server's response automatically update our cookies.
 *
 * If $post is specified as an associative array, an HTTP POST is
 * performed and the data is encoded properly as if we were performing
 * a form submission.
 *
 * Follows redirects (up to a fixed number of hops). If there is a
 * redirect, the page from which you are redirected is lost... but few
 * people put any information on those pages anyways ;-). A redirect
 * is always followed with a GET: $post is dropped.
 *
 * The response body and headers are accumulated in the globals
 * $school_crawl_geturi_write_buf and $school_crawl_geturi_headers_buf
 * by the school_crawl_geturi_write_cb() and
 * school_crawl_geturi_header_cb() callbacks.
 *
 * \param $uri
 *   The URL to fetch. If a redirect occurs, this is updated.
 * \param $cookies
 *   An associative array of cookies and where to save new cookies.
 *   Only the name=value pair of each Set-Cookie header is stored;
 *   attributes such as path or expiry are discarded.
 * \param $school_crawl_log
 *   The school_crawl_log handle to use.
 * \param $post
 *   If not NULL, causes an HTTP POST. In that case, should be an
 *   associative array of form keys/values. A value may itself be an
 *   array, in which case the key is sent once per value.
 * \param $follow_meta_refresh
 *   Parse the resultant HTML with http://docs.php.net/dom and if it
 *   contains a line that looks like ``<meta http-equiv="Refresh" content="0; url=https://simon.ccbcmd.edu/pls/PROD/bwckschd.p_disp_dyn_sched">'',
 *   follow that URL.
 * \param $curlsetup_hook
 *   A function which is passed a curl handle which allows the caller
 *   to do silly things like setting CURLOPT_SSLVERSION for silly
 *   sites like ccbcmd's registration site.
 * \param $loopspin
 *   An internal variable to prevent us from following perpetual
 *   redirects.
 * \return
 *   The body of the document returned by the server (normally
 *   malformed HTML, especially with Calvin's WebAdvisor
 *   installation).
 */
function school_crawl_geturi(&$uri, &$cookies, array &$school_crawl_log, $post = NULL, $follow_meta_refresh = FALSE, $curlsetup_hook = NULL, $loopspin = 0)
{
  global $school_crawl_geturi_write_buf, $school_crawl_geturi_headers_buf;

  school_crawl_logf($school_crawl_log, 7, "school_crawl_geturi('%s').", $uri);

  $curl = curl_init();

  if ($curlsetup_hook !== NULL)
    $curlsetup_hook($curl);

  $school_crawl_geturi_write_buf = '';
  $school_crawl_geturi_headers_buf = '';
  curl_setopt($curl, CURLOPT_URL, $uri);

  $cookies_str = '';
  foreach ($cookies as $key => $val)
    {
      if (strlen($cookies_str))
        $cookies_str .= ';';
      $cookies_str .= $key . '=' . $val;
    }

  school_crawl_logf($school_crawl_log, 10, "cookies sent: %s", $cookies_str);
  curl_setopt($curl, CURLOPT_COOKIE, $cookies_str);
  curl_setopt($curl, CURLOPT_HEADERFUNCTION, 'school_crawl_geturi_header_cb');
  curl_setopt($curl, CURLOPT_WRITEFUNCTION, 'school_crawl_geturi_write_cb');

  if ($post != NULL && is_array($post))
    {
      $posttxt = '';
      foreach ($post as $postkey => $postvals)
        {
          /*
           * This not escaping MEMBER thing is Calvin-specific
           * too. Maybe we need a way to ask for some particular char
           * not to be encoded?
           */

          /*
           * Apparently, browsers like seamonkey will send multiple
           * versions of <input type="hidden" name="field" value="1"
           * /> if another input exists with name="field", like:
           * field=1&field=blah. It seems like the webserver for
           * ccbcmd cares about having these multiple values too...
           *
           * Yes, sending subj_sel=dummy&subj_sel=%25 made _all_ of
           * the difference. Wow.
           */
          if (!is_array($postvals))
            $postvals = array($postvals);
          foreach ($postvals as $postval)
            $posttxt .= (strlen($posttxt) ? '&' : '')
            . urlencode($postkey) . '=' . (strpos($postkey, 'MEMBER') === FALSE ? urlencode($postval) : $postval);
        }
      school_crawl_logf($school_crawl_log, 10, "Setting POST to %s", $posttxt);

      /* Setting CURLOPT_POSTFIELDS is what turns the request into a POST. */
      curl_setopt($curl, CURLOPT_POSTFIELDS, $posttxt);
    }

  /*
   * A transfer failure leaves the buffers empty or partial; say so
   * instead of silently handing back a truncated document.
   */
  if (curl_exec($curl) === FALSE)
    school_crawl_logf($school_crawl_log, 0, "Error fetching %s: %s", $uri, curl_error($curl));
  curl_close($curl);

  $location = NULL;
  foreach (explode("\r\n", $school_crawl_geturi_headers_buf) as $header)
    {
      /*
       * yes, we don't want the line if the first char is a ':' or if
       * it has no ':' (status lines, blank separator lines).
       */
      $colon_pos = strpos($header, ':');
      if (!$colon_pos)
        continue;

      /*
       * Split on the colon alone and trim the value: the whitespace
       * after the colon is optional, so ``Name:value'' must parse too.
       */
      $header_name = substr($header, 0, $colon_pos);
      $header_val = trim(substr($header, $colon_pos + 1));

      school_crawl_logf($school_crawl_log, 9, "%s: %s", $header_name, $header_val);

      /*
       * Header names are case-insensitive (and arrive in lowercase
       * over HTTP/2), so compare a lowercased copy.
       */
      switch (strtolower($header_name))
        {
        case 'set-cookie':
          /*
           * Only the leading name=value pair is the cookie; what
           * follows the first ';' are attributes (path, expires, ...)
           * which must not be echoed back to the server.
           */
          $cookie_pieces = explode(';', $header_val, 2);
          $cookie_pair = explode('=', $cookie_pieces[0], 2);
          if (count($cookie_pair) < 2)
            {
              school_crawl_logf($school_crawl_log, 9, "Ignoring malformed cookie: %s", $header_val);
              break;
            }
          $cookie_name = trim($cookie_pair[0]);
          $cookie_val = trim($cookie_pair[1]);
          if (isset($cookies[$cookie_name]))
            school_crawl_logf($school_crawl_log, 10, "Replacing cookie %s=%s with...", $cookie_name, $cookies[$cookie_name]);
          school_crawl_logf($school_crawl_log, 10, "...new cookie %s=%s.", $cookie_name, $cookie_val);
          $cookies[$cookie_name] = $cookie_val;
          break;

        case 'location':
          /*
           * yes, a calvin-specific replacement :-/
           *
           * The result is used verbatim as the next URL (and written
           * back to the caller's $uri), so nothing -- in particular
           * no newline -- may be appended to it.
           *
           * NOTE(review): a relative Location value is not resolved
           * against $uri here.
           */
          $location = preg_replace(';(kvdata\.calvin\.edu/)(WebAdvisor);', '\1walive/\2', $header_val);
          $post = NULL;
          break;
        }
    }

  /* DOMDocument::loadHTML() cannot parse an empty document. */
  if ($follow_meta_refresh && strlen($school_crawl_geturi_write_buf))
    {
      $dom = new DOMDocument();
      $dom->loadHTML($school_crawl_geturi_write_buf);
      foreach ($dom->getElementsByTagName('meta') as $meta_node)
        if ($meta_node->hasAttribute('http-equiv')
            && !strcasecmp('refresh', $meta_node->getAttribute('http-equiv')))
          {
            $meta_content = $meta_node->getAttribute('content');
            school_crawl_logf($school_crawl_log, 7, "Following http-equiv Refresh: %s", $meta_content);
            /* ``url='' is commonly written as ``URL='', hence the /i. */
            if (!(preg_match('/^[0-9]+; *url=(.*)$/i', $meta_content, $meta_matches)))
              {
                school_crawl_logf($school_crawl_log, 0, "Error following http-equiv Refresh: %s", $meta_content);
              }
            else
              {
                /* A meta refresh takes precedence over a Location header. */
                $location = $meta_matches[1];
                $post = NULL;
              }
          }
    }

  school_crawl_logf($school_crawl_log, 10, "%s", $school_crawl_geturi_write_buf);

  /*
   * Chase the redirect unless we have already hopped too many times,
   * in which case the last document fetched is returned as-is.
   */
  if ($location && $loopspin < 6)
    {
      $uri = $location;
      return school_crawl_geturi($uri, $cookies, $school_crawl_log, $post, $follow_meta_refresh, $curlsetup_hook, $loopspin + 1);
    }
  return $school_crawl_geturi_write_buf;
}

/**
 * \brief
 *   CURLOPT_HEADERFUNCTION callback for school_crawl_geturi().
 *
 * Appends the received header data to the global
 * $school_crawl_geturi_headers_buf.
 *
 * \param $curl
 *   The curl handle (unused).
 * \param $header_buf
 *   The chunk of header data being delivered.
 * \return
 *   The number of bytes in $header_buf.
 */
function school_crawl_geturi_header_cb($curl, $header_buf)
{
  global $school_crawl_geturi_headers_buf;

  $bytes_consumed = strlen($header_buf);
  $school_crawl_geturi_headers_buf .= $header_buf;

  return $bytes_consumed;
}

/**
 * \brief
 *   CURLOPT_WRITEFUNCTION callback for school_crawl_geturi().
 *
 * Appends the received body data to the global
 * $school_crawl_geturi_write_buf.
 *
 * \param $curl
 *   The curl handle (unused).
 * \param $write_buf
 *   The chunk of body data being delivered.
 * \return
 *   The number of bytes in $write_buf.
 */
function school_crawl_geturi_write_cb($curl, $write_buf)
{
  global $school_crawl_geturi_write_buf;

  $bytes_consumed = strlen($write_buf);
  $school_crawl_geturi_write_buf .= $write_buf;

  return $bytes_consumed;
}

/**
 * \brief
 *   Finds the closest parent of a DOM element with a certain tag
 *   name.
 *
 * Useful for finding the <form /> element associated with a given
 * <select /> or set of <input />s so that the form's action=""
 * parameter may be found.
 *
 * The node itself passed in will be considered for whether or not it
 * matches the $element_name.
 *
 * \param $node
 *   The dom node whose ancestor should be found.
 * \param $element_name
 *   The name of the ancestor element which is requested. It is
 *   compared against each element's tagName with strcmp(), so the
 *   match is case-sensitive.
 * \return
 *   The DOMElement sought or NULL if not found.
 */
function school_crawl_element_ancestor(DOMElement $node, $element_name)
{
  /*
   * Climb until we leave the realm of elements. The parent of the
   * topmost element is the DOMDocument (or NULL for a detached
   * node), neither of which has a tagName nor may be passed where a
   * DOMElement is required -- so stop there and report failure.
   */
  for ($candidate = $node; $candidate instanceof DOMElement; $candidate = $candidate->parentNode)
    if (!strcmp($candidate->tagName, $element_name))
      return $candidate;

  return NULL;
}

/**
 * \brief
 *   Create an array based on an HTML form for submitting the form.
 *
 * Currently, this will only support the <input /> and <select />
 * elements. Controls are found anywhere beneath the form node, not
 * just among its direct children, since real-world forms almost
 * always wrap their controls in tables or divs.
 *
 * No distinction is made between input types: every named <input />
 * contributes its value.
 *
 * \param $form_node
 *   The dom node of the form.
 * \return
 *   An array suitable for passing to school_crawl_geturi(): each key
 *   is a control name and each value is an array of that control's
 *   values in document order (all <input />s first, then the
 *   <select />s). An <input /> without a value attribute contributes
 *   an empty string. A <select /> contributes the value of each
 *   <option /> carrying a selected attribute and may thus end up
 *   with an empty array.
 */
function school_crawl_form(DOMElement $form_node)
{
  $form = array();

  $xpath = new DOMXPath($form_node->ownerDocument);
  foreach ($xpath->query('descendant::input[@name]', $form_node) as $input_node)
    {
      $input_name = $input_node->getAttribute('name');
      if (!isset($form[$input_name]))
        $form[$input_name] = array();

      if ($input_node->hasAttribute('value'))
        $form[$input_name][] = $input_node->getAttribute('value');
      else
        /* not sure about what best to do in this case... */
        $form[$input_name][] = '';
    }

  foreach ($xpath->query('descendant::select[@name]', $form_node) as $select_node)
    {
      $select_name = $select_node->getAttribute('name');
      if (!isset($form[$select_name]))
        $form[$select_name] = array();

      /*
       * ``selected'' is an attribute, hence the '@'; a bare name in
       * an XPath predicate would look for a child *element* called
       * selected and never match. The descendant axis also reaches
       * options grouped inside an <optgroup />.
       */
      foreach ($xpath->query('descendant::option[@selected][@value]', $select_node) as $option_node)
        $form[$select_name][] = $option_node->getAttribute('value');
    }

  return $form;
}

/**
 * \brief
 *   Resolve a relativish URL.
 *
 * Handles, in order:
 * - $url that already carries a scheme (returned as-is),
 * - an empty $url (means $orig_url itself),
 * - scheme-relative URLs (``//host/path''),
 * - fragment-only and query-only references,
 * - host-relative paths (``/path''),
 * - paths relative to the directory of $orig_url, including ``./''
 *   and ``../'' segments.
 *
 * Leading and trailing whitespace on either argument is ignored.
 *
 * \param $orig_url
 *   The original URL, i.e., that of the document in which $url was
 *   found. Must be an absolute http or https URL.
 * \param $url
 *   The new URL to be reconciled with the original one.
 * \return
 *   A string, the new URL. If $orig_url cannot be parsed as an
 *   absolute http(s) URL, $url is returned unchanged because there
 *   is nothing to resolve it against.
 */
function school_crawl_url($orig_url, $url)
{
  $orig_url = trim($orig_url);
  $url = trim($url);

  /* Anything starting with ``scheme:'' is already absolute. */
  if (preg_match(';^[a-z][a-z0-9+.-]*:;i', $url))
    return $url;

  if (!preg_match(';^(https?)://([^/?#]+)([^?#]*);i', $orig_url, $matches))
    return $url;
  $schema = $matches[1];
  $hostname = $matches[2];
  /* ``http://host'' with no path at all lives at the root. */
  $base_path = strlen($matches[3]) ? $matches[3] : '/';

  if (!strlen($url))
    return $orig_url;

  /* scheme-relative: borrow only the scheme */
  if (!strncmp($url, '//', 2))
    return $schema . ':' . $url;

  /*
   * Separate the path from any query string/fragment so that the
   * latter may be carried over untouched.
   */
  $path_len = strcspn($url, '?#');
  $rel_path = (string)substr($url, 0, $path_len);
  $suffix = (string)substr($url, $path_len);

  if (!strlen($rel_path))
    {
      /* a bare fragment refers to the original document, query included */
      if ($suffix[0] == '#')
        return substr($orig_url, 0, strcspn($orig_url, '#')) . $suffix;
      /* a bare query string replaces the original document's query */
      $path = $base_path;
    }
  elseif ($rel_path[0] == '/')
    /* an absolute path on the same host */
    $path = $rel_path;
  else
    /* relative to the directory containing the original document */
    $path = substr($base_path, 0, strrpos($base_path, '/') + 1) . $rel_path;

  /*
   * Collapse ``.'' and ``..'' segments. $path always starts with
   * '/', so the first segment is the empty string in front of it;
   * that segment is never popped, which keeps ``..'' from climbing
   * above the root.
   */
  $segments = explode('/', $path);
  $last_index = count($segments) - 1;
  $resolved = array();
  foreach ($segments as $index => $segment)
    {
      if ($segment !== '.' && $segment !== '..')
        {
          $resolved[] = $segment;
          continue;
        }
      if ($segment === '..' && count($resolved) > 1)
        array_pop($resolved);
      /* a trailing dot segment names a directory: keep the final '/' */
      if ($index === $last_index)
        $resolved[] = '';
    }

  return $schema . '://' . $hostname . implode('/', $resolved) . $suffix;
}

/**
 * \brief
 *   Map a name onto a column of the table with the help of <th />.
 *
 * This should be a quite reliable way of matching the data that a
 * user sees onto the actual data because, in most cases, HTML writers
 * are forced to properly align <th /> and the following hundreds of
 * <td />s for there to be a visual alignment.
 *
 * The cells examined are those returned by
 * school_crawl_table_rownodes() for $tr_node; the first one whose
 * text matches wins.
 *
 * \param $tr_node
 *   The <tr /> with the <th /> elements to resolve.
 * \param $column_name
 *   The name of the column to search for.
 * \param $strcmp
 *   The function to use with a strcmp() interface when judging
 *   whether or not a cell's textContent matches $column_name. A
 *   falsy return value means ``match''.
 * \param $trim
 *   The function to apply to the cell's textContent before
 *   subjecting it to the $strcmp test.
 * \return
 *   The 0-based index of the column offset or FALSE if the item isn't
 *   found. This index ignores the existence of text elements, so be
 *   careful in using the result.
 */
function school_crawl_table_resolve_column(DOMElement $tr_node, $column_name, $strcmp = 'strcasecmp', $trim = 'trim')
{
  $column_index = 0;
  foreach (school_crawl_table_rownodes($tr_node) as $cell_node)
    {
      $cell_text = $trim($cell_node->textContent);
      if (!$strcmp($column_name, $cell_text))
        return $column_index;
      $column_index ++;
    }

  return FALSE;
}

/**
 * \brief
 *   Get a DOMNodeList of a row's cells without #text elements in
 *   the way.
 *
 * Helpful when using school_crawl_table_resolve_column() to get data.
 *
 * \param $tr_node
 *   The <tr /> whose cells are wanted.
 * \return
 *   A DOMNodeList of every <th /> and <td /> element found anywhere
 *   beneath $tr_node, in document order.
 */
function school_crawl_table_rownodes(DOMElement $tr_node)
{
  $cell_query = 'descendant::*[self::th or self::td]';
  $row_xpath = new DOMXPath($tr_node->ownerDocument);

  return $row_xpath->query($cell_query, $tr_node);
}