Changeset - 3487a5b3cbfd
[Not reviewed]
default
0 1 0
Nathan Brink (binki) - 13 years ago 2012-10-01 20:19:09
ohnobinki@ohnopublishing.net
Fix some crawler utility functions to handle relative URI resolving correctly and be more flexible when parsing days.
1 file changed with 13 insertions and 4 deletions:
0 comments (0 inline, 0 general)
inc/school.crawl.inc
Show inline comments
 
@@ -197,53 +197,60 @@ function school_crawl_days_format(array 
 
	school_crawl_logf($school_crawl_log, 5, "school_crawl_days_format() got invalid day specifier: `%s' => `%s'.",
 
			  $day_orig, $day);
 
    }
 

	
 
  $day_str = '';
 
  foreach ($my_days as $day_val => $junk)
 
    $day_str .= $day_val;
 

	
 
  return $day_str;
 
}
 

	
 
/**
 * \brief
 *   Take a string of day initials and format it.
 *
 * Every character which is not alphanumeric (whitespace, commas,
 * dashes, etc.) is discarded before the string is split into
 * individual day initials, so that separators are never handed to
 * school_crawl_days_format() as if they were days.
 *
 * \param $school_crawl_log
 *   The school_crawl_log handle to write errors out to.
 * \param $days_str
 *   Example input: 'mwf', 'TR', 'M, W, F'.
 * \return
 *   Same as school_crawl_days_format(). An empty string is returned
 *   if $days_str contains no alphanumeric characters.
 */
function school_crawl_days_str_format(array $school_crawl_log, $days_str)
{
  /* Drop separators first so that only candidate day initials remain. */
  $days_str = preg_replace('/[^[:alnum:]]+/', '', (string)$days_str);

  /*
   * On PHP versions before 8.2, str_split() produces an array with
   * one empty string in it if the input string is empty. We just
   * want "no days" in that case, so shortcut and return early. The
   * comparison is strict on purpose: empty() would also swallow the
   * string '0'. preg_replace() returns NULL if it fails.
   */
  if ($days_str === NULL || $days_str === '')
    return '';

  return school_crawl_days_format($school_crawl_log, str_split($days_str));
}
 

	
 
/**
 
 * \brief
 
 *   Try to guess a more standardized section_meeting type.
 
 *
 
 * \param $meeting_type
 
 *   The upstream's meeting_type, such as 'LEC', 'lec', 'LAB',
 
 *   etc. New mappings should be added to this function as long as
 
 *   they are general enough.
 
 */
 
function school_crawl_meeting_type($meeting_type = 'lecture')
 
{
 
  static $meeting_type_maps =
 
    array(
 
	  'lec' => 'lecture',
 
	  'lab' => 'lab',
 
	  'dis' => 'discussion',
 
	  );
 

	
 
  if (empty($meeting_type))
 
    $meeting_type = 'lecture';
 

	
 
  $meeting_type = strtolower(trim($meeting_type));
 
@@ -583,48 +590,50 @@ function school_crawl_url($orig_url, $ur
 
{
 
  /*
 
   * This accounts for both if the $url is already an absolute, fully
 
   * qualified URL. It falls back to the original URL if it fails to
 
   * match.
 
   */
 
  foreach (array($url, $orig_url) as $aurl)
 
    if (preg_match(';^(https?)://([^/]+)(/.*)$;', $aurl, $matches))
 
      {
 
	$new_url['schema'] = $matches[1];
 
	$new_url['hostname'] = $matches[2];
 
	$new_url['path'] = $matches[3];
 
      }
 

	
 
  /* check if $url is an absolute path on the same host (it starts with '/') */
 
  if (!strncmp($url, '/', 1))
 
    $new_url['path'] = $url;
 

	
 
  /* relative */
 
  while (!strncmp($url, '../', 3))
 
    {
 
      $new_url['path'] = preg_replace(';[^/]+/[^/]+$;', '/', dirname($new_url['path']));
 
      $url = substr($url, 3);
 
    }
 
  if (strncmp($url, '/', 1) && strpos($url, '://') === FALSE)
 
    $new_url['path'] .= $url;
 

	
 
  return $new_url['schema'] . '://' . $new_url['hostname'] . $new_url['path'];
 
}
 

	
 
/**
 
 * \brief
 
 *   Map a name onto a column of the table with the help of <th />.
 
 *
 
 * This should be a quite reliable way of matching the data that a
 
 * user sees onto the actual data because, in most cases, HTML writers
 
 * are forced to properly align <th /> and the following hundreds of
 
 * <td />s for there to be a visual alignment.
 
 *
 
 * \param $tr_node
 
 *   The <tr /> with the <th /> elements to resolve.
 
 * \param $column_name
 
 *   The name of the column to search for.
 
 * \param $strcmp
 

	
 
 *   The function to use with a strcmp($text_content, $column_name) interface when judging
 
 *   whether or not a <th />'s textContent matches $column_name.
 

	
 
 * \param $trim
 
 *   The function to apply to the <th />'s textContent before
0 comments (0 inline, 0 general)