#!/usr/bin/perl
# OLAC data provider mainline for perseus
use strict;
use Perseus;
use Ptext::Info;
use CGI -newstyle_urls;
use DBI;
use POSIX qw(strftime);
use Time::Timezone;
require "access.pl";
my $version = "1.1";
my $STAT_OK = 200;
my $STAT_ERROR = 400;
my $pers_pfx = "http://www.perseus.tufts.edu";
# Namespace URIs: pointers to human-readable description of meta-data format
my %schemata = (oai_dc => "http://purl.org/dc/elements/1.1/",
olac => "http://www.language-archives.org/OLAC/0.2/",
perseus => "$pers_pfx/persmeta.dtd",
);
# Schema location: pointer to schema for meta-data format
my %schemata_loc = (oai_dc => "http://www.openarchives.org/OAI/$version/dc.xsd",
olac => "http://www.language-archives.org/OLAC/olac-0.2.xsd",
perseus => "$pers_pfx/persmeta.xsd",
);
my $xsi = "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"";
# The perseus:Status metadata field indicates whether the text is
# published yet. We get it from a subroutine because development
# machines can show texts that public ones can't; this routine
# checks what kind of server we are and adjusts the list as needed.
my %validStatus = map {$_, 1} ('', valid_status_list());
my $cgi = new CGI;
my $verb = $cgi->param('verb');
# potential other arguments, used by some verbs
my $id = $cgi->param('identifier');
my $mpfx = $cgi->param('metadataPrefix');
my $until = $cgi->param('until'); # a date
my $from = $cgi->param('from'); # a date
my $set = $cgi->param('set'); # a set identifier
my $token = $cgi->param('resumptionToken');
# convert external forms to internal forms as needed
$id =~ s/^oai:perseus://;
$set = "Perseus:collection:$set" if ($set ne '' and $set !~ /^Perseus:collection:/);
$set =~ s/9/-/g;
my $status = $STAT_OK;
my $stat_msg = "";
my $opener = "<$verb xmlns=\"http://www.openarchives.org/OAI/$version/OAI_$verb\" \n";
$opener .= " $xsi \n";
$opener .= " xsi:schemaLocation=\"http://www.openarchives.org/OAI/$version/OAI_$verb \n";
$opener .= " http://www.openarchives.org/OAI/$version/OAI_$verb.xsd\">\n";
my $closer = "$verb>";
# The datestamp in the reply header is supposed to be the date of the most
# recent change to the object's metadata. Since we're currently not
# storing that, I'll use date.available as a first approximation.
# Main structure is a switch on $verb. For each verb, first check that we have
# suitable arguments, then read the necessary records and turn them into
# appropriate XML. The generation of XML happens in format_record.
my $content = "";
if ($verb eq 'GetRecord')
{
# required: id and mpfx
if ($id eq '')
{
$status = $STAT_ERROR;
$stat_msg = "Identifier not specified for $verb";
}
check_mpfx($mpfx, $verb, $status, $stat_msg);
goto report_error unless ($status == $STAT_OK);
$content = format_record($id, $mpfx);
if ($content eq '')
{
$status = $STAT_ERROR;
$stat_msg = "Record $id not found for $verb";
goto report_error;
}
}
elsif ($verb eq 'Identify')
{
# no arguments
# Since this stuff rarely changes, it does not require a database access.
$content = "Perseus Digital Library\n";
$content .= "$pers_pfx/cgi-bin/pdataprov\n";
$content .= "$version\n";
$content .= "mailto:webmaster\@perseus.tufts.edu\n";
# if OLAC agrees on a description container, use it here
# $content .= "...\n:
}
elsif ($verb eq 'ListIdentifiers' or $verb eq 'ListRecords')
{
# optional: until or from or set, exclusive: resumption token
check_token($token, $until, $from, $set, $verb, $status, $stat_msg);
check_mpfx($mpfx, $verb, $status, $stat_msg) if ($token eq '' and $verb eq 'ListRecords');
goto report_error unless ($status == $STAT_OK);
# The use of the flat file here is an optimization. Much of the code
# here comes from perscoll, the Perseus Collection Browser (table of
# contents routine). Since that routine is run very frequently, it
# uses a pre-built list of metadata records in a file instead of making
# a database query every time. For perscoll, it turns out to be faster
# to read in all the metadata records and filter them here than to execute
# a query that selects just the appropriate ones; this is related to the way
# we store metadata, which is pretty efficient if you're looking up one object,
# but messy if you want all the fields for only some of the records.
# The present routine will not be used so intensively, so queries the database
# for each individual record because that's easier to write.
my $metafile = "DBs/ptext.db";
if (not open METAFILE, $metafile)
{
$status = $STAT_ERROR;
$stat_msg = "Unable to open $metafile -- $!";
goto report_error;
}
my $curid = '';
my %currec = ();
my @records = ();
while (my $rec = )
{ # records are tab-delimited, and "\N" stands for a null value
$rec =~ s/\t\\N/\t/g;
my ($id, $subdoc, $ns, $key, $subkey, $schema, $value, $valueid, $lang)
= split /\t/, $rec;
next unless ($id =~ /:text:/);
if ($id ne $curid)
{
if ($curid ne '' and use_record(\%currec, $curid, $set, $until, $from))
{
push @records, [ $curid, { %currec } ];
}
%currec = ();
$curid = $id;
}
my $typekey = "$ns:$key";
$typekey .= ".$subkey" if $subkey;
$typekey .= "[$schema]" if $schema;
push @{ $currec{$typekey} }, [ $value, $valueid, $lang ];
}
if ($curid ne '' and use_record(\%currec, $curid, $set, $until, $from))
{
push @records, [ $curid, { %currec } ];
}
close METAFILE;
$content = "";
foreach my $rec (@records)
{
my ($id, $data) = @{ $rec };
if ($verb eq 'ListIdentifiers')
{
$content .= "oai:perseus:$id\n";
}
else
{
# as noted above, this is inefficient because it goes to the
# database for each object, when we've already got the metadata
# in hand from the flat file -- but it's easier to maintain
$content .= format_record($id, $mpfx);
}
}
}
elsif ($verb eq 'ListMetadataFormats')
{
# optional: id
# since we can supply the same meta-data formats for any record,
# we ignore the identifier argument
# Plain Dublin Core
$content = "oai_dc\n";
$content .= "$schemata_loc{'oai_dc'}\n";
# OLAC form
$content .= "olac\n";
$content .= "$schemata_loc{'olac'}\n";
# Perseus extended meta-data (not supplied yet)
$content .= "perseus\n";
$content .= "$schemata_loc{'perseus'}\n";
}
elsif ($verb eq 'ListSets')
{
# exclusive: token
# The Perseus collections are a natural use for sets.
my ($colls, $collPretty, $collSort, $collDescr, $collOverview)
= all_collections();
$content = "";
foreach my $coll (@{$colls})
{
next if ($coll eq 'Any');
my $show_coll = $coll;
$show_coll =~ s/-/9/g; # set specs must be alphanumeric parts joined by colons
$content .= "\n$show_coll\n";
$content .= "" . $$collPretty{$coll} . "\n\n";
}
}
else
{
$status = $STAT_ERROR;
$stat_msg = "Unrecognized verb $verb";
}
report_error:;
if ($status != $STAT_OK)
{
print $cgi->header(-type=>"text/xml", -status=>$status, -reason_phrase=>$stat_msg);
print "\n";
print "<$verb>$stat_msg$verb>\n";
exit $status;
}
# if we're here, we have built usable content: now we can emit it
print $cgi->header(-type=>"text/xml", -status=>$status);
print "\n";
print "$opener\n";
my $zone = tz_local_offset();
my $tz_min = $zone / 60;
my $tz_hours = $tz_min / 60;
my $tail = sprintf("%+02.2d:%02d", $tz_hours, $zone % 60);
print POSIX::strftime("%Y-%m-%dT%H:%M:%S$tail\n", localtime);
my $request = "http://$ENV{'SERVER_NAME'}" . $cgi->url(-absolute=>1, -query=>1);
print "$request\n";
# fix up ampersands, which appear fairly often in the data
$content =~ s/&/&/g;
# also fix up at signs
$content =~ s/@/@/g;
print $content;
print "$closer\n";
# note we can't use the cgi end-html function, since we're emitting xml, not html
# subroutines follow, in alphabetical order
sub check_mpfx
{
my ($mpfx, $verb, $status, $stat_msg) = @_;
if ($mpfx eq '')
{
$status = $STAT_ERROR;
if ($stat_msg eq "")
{
$stat_msg = "Metadata prefix not specified for $verb";
}
else
{
$stat_msg .= " and metadata prefix not specified for $verb";
}
}
elsif (! defined($schemata{$mpfx}))
{
$status = $STAT_ERROR;
if ($stat_msg eq "")
{
$stat_msg = "Unrecognized metadata prefix $mpfx for $verb";
}
else
{
$stat_msg .= " and unrecognized metadata prefix $mpfx for $verb";
}
}
# update the return arguments
@_[2] = $status;
@_[3] = $stat_msg;
}
sub check_token
{
my ($token, $until, $from, $set, $verb, $status, $stat_msg) = @_;
if (($token ne '') and ($until ne '' or $from ne '' or $set ne ''))
{
$status = $STAT_ERROR;
$stat_msg = "Additional arguments specified along with resumptionToken for $verb";
}
# update the return arguments
@_[5] = $status;
@_[6] = $stat_msg;
}
sub format_record {
my ($id, $mpfx) = @_;
my $content = "";
# Routine get_doc_info reads the metadata for a given object.
# The return value is a reference to a hash. The first key is the
# subdocument id, which is a null string for simple texts. It would
# be non-null for something like Cicero's speeches, where we have
# several speeches in the same XML file. For each subdocument, the
# hash value is another hash whose keys are the metadata elements,
# in the form ns:Key.Qual[schema] (where "qual" and "schema" are
# optional). The namespace "ns" is "dc" for Dublin Core fields,
# "perseus" for additional fields we use. The key is the field name,
# and the qualifier works as in Dublin Core. The "schema" is a
# further qualification, for example to distinguish various
# uses of the Subject field. Each value in this hash is an array,
# consisting of all the values for this metadata field for this object.
# For example, given a text with three authors, $$info{''}{'dc:Creator'}[0]
# would be the first author's name, $$info{''}{'dc:Creator'}[1] the second,
# and $$info{''}{'dc:Creator'}[2] the third.
my $info = get_doc_info($id);
if (! %{$info})
{
return $content;
}
my $date_stamp = $$info{''}{'dc:Date.Available'}[0];
$date_stamp = '1985-01-01' if $date_stamp eq ''; # lower bound for Perseus dates
$content = "oai:perseus:$id$date_stamp\n";
$content .= "\n";
$content .= "<$mpfx xmlns=\"$schemata{$mpfx}\" \n";
$content .= " $xsi \n";
$content .= " xsi:schemaLocation=\"$schemata{$mpfx} \n $schemata_loc{$mpfx}\">";
# fields common to all supported schemata
my $lookup;
my @dcfields = qw(title creator subject description contributor
publisher date type format source
relation coverage rights);
foreach my $field(@dcfields)
{
($lookup = $field) =~ s/(\w+)/\u\L$1/g;
$lookup = "dc:$lookup";
foreach my $subdoc (keys %{ $info })
{
foreach my $element (@{ $$info{$subdoc}{$lookup} })
{
$content .= "<$field>$element";
$content .= " for subdoc $subdoc" if ($subdoc ne '');
$content .= "$field>\n";
}
}
}
# common fields requiring additional handling
{
foreach my $subdoc (keys %{ $info })
{
foreach my $element (@{ $$info{$subdoc}{'dc:Language'} })
{
my $lang = language_code($element, $mpfx);
$content .= "\n";
}
}
}
# identifier field, not from our metadata but constructed from the document ID
# this is where you can actually see the text
$content .= "$pers_pfx/cgi-bin/ptext?doc=$id\n";
# fields from OLAC only
if ($mpfx eq 'olac')
{
my @olacfields = ('subject.language',
);
foreach my $field (@olacfields)
{
($lookup = $field) =~ s/(\w+)/\u\L$1/g;
$lookup = "dc:$lookup";
foreach my $subdoc (keys %{ $info })
{
foreach my $element (@{ $$info{$subdoc}{$lookup} })
{
if ($field =~ /language/)
{
my $lang = language_code($element, $mpfx);
$content .= "<$field identifier=\"$lang\" />\n";
}
else
{
$content .= "<$field>$element";
$content .= " for subdoc $subdoc" if ($subdoc ne '');
$content .= "$field>\n";
}
}
}
}
}
# fields from Perseus only
if ($mpfx eq 'perseus')
{
my @persfields = qw(citation layout.stylesheet layout.template
status funder witness speaker
method correctionlevel);
foreach my $field (@persfields)
{
($lookup = $field) =~ s/(\w+)/\u\L$1/g;
$lookup = "perseus:$lookup";
foreach my $subdoc (keys %{ $info })
{
foreach my $element (@{ $$info{$subdoc}{$lookup} })
{
$content .= "<$field>$element";
$content .= " for subdoc $subdoc" if ($subdoc ne '');
$content .= "$field>\n";
}
}
}
my %persdc = ('date.available' => 'dc:Date.Available',
'relation.isversionof' => '[dc:Relation.IsVersionOf]',
'relation.iscommentaryon' => '[dc:Relation.IsCommentaryOn]');
foreach my $field(keys %persdc)
{
$lookup = $persdc{$field};
next if ($lookup eq '');
foreach my $subdoc (keys %{ $info })
{
foreach my $element (@{ $$info{$subdoc}{$lookup} })
{
$content .= "<$field>$element";
$content .= " for subdoc $subdoc" if ($subdoc ne '');
$content .= "$field>\n";
}
}
}
}
$content .= "$mpfx>\n";
$content .= "\n\n";
}
sub language_code
{
my ($in_lang, $mpfx) = @_;
my %rfc1766 = (en => 'eng',
la => 'lat',
greek => 'grc',
de => 'ger',
es => 'spa',
fr => 'fre',
it => 'ita',
sanskrit => 'san',
);
my %ethnologue = (en => 'ENG',
la => 'LTN',
greek => 'GKO',
de => 'GER',
es => 'SPN',
fr => 'FRN',
it => 'ITN',
sanskrit => 'SKT',
);
my $out_lang;
if ($mpfx eq 'olac')
{
$out_lang = $ethnologue{$in_lang};
$out_lang = "x-sil-$out_lang" unless ($out_lang eq '');
}
else
{
$out_lang = $rfc1766{$in_lang};
}
$out_lang = $in_lang if ($out_lang eq '');
$out_lang;
}
sub use_record
{
my ($currec, $id, $set, $until, $from) = @_;
return 0 unless $validStatus{$$currec{'perseus:Status'}->[0]->[0]};
my $good = 1;
# check the type
# if it doesn't have a type, assume it's not real
$good = exists($$currec{'dc:Type'});
# currently we're only distributing meta-data for texts,
# not images, image collections, maps, or tools
if ($$currec{'dc:Type'}->[0][0] ne 'text')
{
$good = 0;
}
return $good unless $good;
# check the series
if ($id =~ /\.00\./ and ! $Perseus::development)
{
$good = 0;
}
return $good unless $good;
# check the collection
if ($set ne '')
{
my $found = 0;
foreach my $one_coll (@{ $$currec{'dc:Relation.IsPartOf'} })
{
if ($$one_coll[1] eq $set)
{
$found = 1;
last;
}
}
$good = $found;
}
return $good unless $good;
# check the date
my $avail = $$currec{'dc:Date.Available'}->[0][0];
if ($from ne '' or $until ne '')
{
if ($avail eq '')
{
$good = 0; # treat missing date as out of range
}
else
{
my $date_check = 1;
$date_check = 0 if ((($from ne '') and ($avail lt $from))
or (($until ne '') and ($avail gt $until)));
$good = $date_check;
}
}
return $good unless $good;
# more checks could come here if necessary
$good;
}