#!/usr/bin/perl # OLAC data provider mainline for perseus use strict; use Perseus; use Ptext::Info; use CGI -newstyle_urls; use DBI; use POSIX qw(strftime); use Time::Timezone; require "access.pl"; my $version = "1.1"; my $STAT_OK = 200; my $STAT_ERROR = 400; my $pers_pfx = "http://www.perseus.tufts.edu"; # Namespace URIs: pointers to human-readable description of meta-data format my %schemata = (oai_dc => "http://purl.org/dc/elements/1.1/", olac => "http://www.language-archives.org/OLAC/0.2/", perseus => "$pers_pfx/persmeta.dtd", ); # Schema location: pointer to schema for meta-data format my %schemata_loc = (oai_dc => "http://www.openarchives.org/OAI/$version/dc.xsd", olac => "http://www.language-archives.org/OLAC/olac-0.2.xsd", perseus => "$pers_pfx/persmeta.xsd", ); my $xsi = "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\""; # The perseus:Status metadata field indicates whether the text is # published yet. We get it from a subroutine because development # machines can show texts that public ones can't; this routine # checks what kind of server we are and adjusts the list as needed. my %validStatus = map {$_, 1} ('', valid_status_list()); my $cgi = new CGI; my $verb = $cgi->param('verb'); # potential other arguments, used by some verbs my $id = $cgi->param('identifier'); my $mpfx = $cgi->param('metadataPrefix'); my $until = $cgi->param('until'); # a date my $from = $cgi->param('from'); # a date my $set = $cgi->param('set'); # a set identifier my $token = $cgi->param('resumptionToken'); # convert external forms to internal forms as needed $id =~ s/^oai:perseus://; $set = "Perseus:collection:$set" if ($set ne '' and $set !~ /^Perseus:collection:/); $set =~ s/9/-/g; my $status = $STAT_OK; my $stat_msg = ""; my $opener = "<$verb xmlns=\"http://www.openarchives.org/OAI/$version/OAI_$verb\" \n"; $opener .= " $xsi \n"; $opener .= " xsi:schemaLocation=\"http://www.openarchives.org/OAI/$version/OAI_$verb \n"; $opener .= " http://www.openarchives.org/OAI/$version/OAI_$verb.xsd\">\n"; my $closer = ""; # The datestamp in the reply header is supposed to be the date of the most # recent change to the object's metadata. Since we're currently not # storing that, I'll use date.available as a first approximation. # Main structure is a switch on $verb. For each verb, first check that we have # suitable arguments, then read the necessary records and turn them into # appropriate XML. The generation of XML happens in format_record. my $content = ""; if ($verb eq 'GetRecord') { # required: id and mpfx if ($id eq '') { $status = $STAT_ERROR; $stat_msg = "Identifier not specified for $verb"; } check_mpfx($mpfx, $verb, $status, $stat_msg); goto report_error unless ($status == $STAT_OK); $content = format_record($id, $mpfx); if ($content eq '') { $status = $STAT_ERROR; $stat_msg = "Record $id not found for $verb"; goto report_error; } } elsif ($verb eq 'Identify') { # no arguments # Since this stuff rarely changes, it does not require a database access. $content = "Perseus Digital Library\n"; $content .= "$pers_pfx/cgi-bin/pdataprov\n"; $content .= "$version\n"; $content .= "mailto:webmaster\@perseus.tufts.edu\n"; # if OLAC agrees on a description container, use it here # $content .= "...\n: } elsif ($verb eq 'ListIdentifiers' or $verb eq 'ListRecords') { # optional: until or from or set, exclusive: resumption token check_token($token, $until, $from, $set, $verb, $status, $stat_msg); check_mpfx($mpfx, $verb, $status, $stat_msg) if ($token eq '' and $verb eq 'ListRecords'); goto report_error unless ($status == $STAT_OK); # The use of the flat file here is an optimization. Much of the code # here comes from perscoll, the Perseus Collection Browser (table of # contents routine). Since that routine is run very frequently, it # uses a pre-built list of metadata records in a file instead of making # a database query every time. For perscoll, it turns out to be faster # to read in all the metadata records and filter them here than to execute # a query that selects just the appropriate ones; this is related to the way # we store metadata, which is pretty efficient if you're looking up one object, # but messy if you want all the fields for only some of the records. # The present routine will not be used so intensively, so queries the database # for each individual record because that's easier to write. my $metafile = "DBs/ptext.db"; if (not open METAFILE, $metafile) { $status = $STAT_ERROR; $stat_msg = "Unable to open $metafile -- $!"; goto report_error; } my $curid = ''; my %currec = (); my @records = (); while (my $rec = ) { # records are tab-delimited, and "\N" stands for a null value $rec =~ s/\t\\N/\t/g; my ($id, $subdoc, $ns, $key, $subkey, $schema, $value, $valueid, $lang) = split /\t/, $rec; next unless ($id =~ /:text:/); if ($id ne $curid) { if ($curid ne '' and use_record(\%currec, $curid, $set, $until, $from)) { push @records, [ $curid, { %currec } ]; } %currec = (); $curid = $id; } my $typekey = "$ns:$key"; $typekey .= ".$subkey" if $subkey; $typekey .= "[$schema]" if $schema; push @{ $currec{$typekey} }, [ $value, $valueid, $lang ]; } if ($curid ne '' and use_record(\%currec, $curid, $set, $until, $from)) { push @records, [ $curid, { %currec } ]; } close METAFILE; $content = ""; foreach my $rec (@records) { my ($id, $data) = @{ $rec }; if ($verb eq 'ListIdentifiers') { $content .= "oai:perseus:$id\n"; } else { # as noted above, this is inefficient because it goes to the # database for each object, when we've already got the metadata # in hand from the flat file -- but it's easier to maintain $content .= format_record($id, $mpfx); } } } elsif ($verb eq 'ListMetadataFormats') { # optional: id # since we can supply the same meta-data formats for any record, # we ignore the identifier argument # Plain Dublin Core $content = "oai_dc\n"; $content .= "$schemata_loc{'oai_dc'}\n"; # OLAC form $content .= "olac\n"; $content .= "$schemata_loc{'olac'}\n"; # Perseus extended meta-data (not supplied yet) $content .= "perseus\n"; $content .= "$schemata_loc{'perseus'}\n"; } elsif ($verb eq 'ListSets') { # exclusive: token # The Perseus collections are a natural use for sets. my ($colls, $collPretty, $collSort, $collDescr, $collOverview) = all_collections(); $content = ""; foreach my $coll (@{$colls}) { next if ($coll eq 'Any'); my $show_coll = $coll; $show_coll =~ s/-/9/g; # set specs must be alphanumeric parts joined by colons $content .= "\n$show_coll\n"; $content .= "" . $$collPretty{$coll} . "\n\n"; } } else { $status = $STAT_ERROR; $stat_msg = "Unrecognized verb $verb"; } report_error:; if ($status != $STAT_OK) { print $cgi->header(-type=>"text/xml", -status=>$status, -reason_phrase=>$stat_msg); print "\n"; print "<$verb>$stat_msg\n"; exit $status; } # if we're here, we have built usable content: now we can emit it print $cgi->header(-type=>"text/xml", -status=>$status); print "\n"; print "$opener\n"; my $zone = tz_local_offset(); my $tz_min = $zone / 60; my $tz_hours = $tz_min / 60; my $tail = sprintf("%+02.2d:%02d", $tz_hours, $zone % 60); print POSIX::strftime("%Y-%m-%dT%H:%M:%S$tail\n", localtime); my $request = "http://$ENV{'SERVER_NAME'}" . $cgi->url(-absolute=>1, -query=>1); print "$request\n"; # fix up ampersands, which appear fairly often in the data $content =~ s/&/&/g; # also fix up at signs $content =~ s/@/@/g; print $content; print "$closer\n"; # note we can't use the cgi end-html function, since we're emitting xml, not html # subroutines follow, in alphabetical order sub check_mpfx { my ($mpfx, $verb, $status, $stat_msg) = @_; if ($mpfx eq '') { $status = $STAT_ERROR; if ($stat_msg eq "") { $stat_msg = "Metadata prefix not specified for $verb"; } else { $stat_msg .= " and metadata prefix not specified for $verb"; } } elsif (! defined($schemata{$mpfx})) { $status = $STAT_ERROR; if ($stat_msg eq "") { $stat_msg = "Unrecognized metadata prefix $mpfx for $verb"; } else { $stat_msg .= " and unrecognized metadata prefix $mpfx for $verb"; } } # update the return arguments @_[2] = $status; @_[3] = $stat_msg; } sub check_token { my ($token, $until, $from, $set, $verb, $status, $stat_msg) = @_; if (($token ne '') and ($until ne '' or $from ne '' or $set ne '')) { $status = $STAT_ERROR; $stat_msg = "Additional arguments specified along with resumptionToken for $verb"; } # update the return arguments @_[5] = $status; @_[6] = $stat_msg; } sub format_record { my ($id, $mpfx) = @_; my $content = ""; # Routine get_doc_info reads the metadata for a given object. # The return value is a reference to a hash. The first key is the # subdocument id, which is a null string for simple texts. It would # be non-null for something like Cicero's speeches, where we have # several speeches in the same XML file. For each subdocument, the # hash value is another hash whose keys are the metadata elements, # in the form ns:Key.Qual[schema] (where "qual" and "schema" are # optional). The namespace "ns" is "dc" for Dublin Core fields, # "perseus" for additional fields we use. The key is the field name, # and the qualifier works as in Dublin Core. The "schema" is a # further qualification, for example to distinguish various # uses of the Subject field. Each value in this hash is an array, # consisting of all the values for this metadata field for this object. # For example, given a text with three authors, $$info{''}{'dc:Creator'}[0] # would be the first author's name, $$info{''}{'dc:Creator'}[1] the second, # and $$info{''}{'dc:Creator'}[2] the third. my $info = get_doc_info($id); if (! %{$info}) { return $content; } my $date_stamp = $$info{''}{'dc:Date.Available'}[0]; $date_stamp = '1985-01-01' if $date_stamp eq ''; # lower bound for Perseus dates $content = "
oai:perseus:$id$date_stamp
\n"; $content .= "\n"; $content .= "<$mpfx xmlns=\"$schemata{$mpfx}\" \n"; $content .= " $xsi \n"; $content .= " xsi:schemaLocation=\"$schemata{$mpfx} \n $schemata_loc{$mpfx}\">"; # fields common to all supported schemata my $lookup; my @dcfields = qw(title creator subject description contributor publisher date type format source relation coverage rights); foreach my $field(@dcfields) { ($lookup = $field) =~ s/(\w+)/\u\L$1/g; $lookup = "dc:$lookup"; foreach my $subdoc (keys %{ $info }) { foreach my $element (@{ $$info{$subdoc}{$lookup} }) { $content .= "<$field>$element"; $content .= " for subdoc $subdoc" if ($subdoc ne ''); $content .= "\n"; } } } # common fields requiring additional handling { foreach my $subdoc (keys %{ $info }) { foreach my $element (@{ $$info{$subdoc}{'dc:Language'} }) { my $lang = language_code($element, $mpfx); $content .= "\n"; } } } # identifier field, not from our metadata but constructed from the document ID # this is where you can actually see the text $content .= "$pers_pfx/cgi-bin/ptext?doc=$id\n"; # fields from OLAC only if ($mpfx eq 'olac') { my @olacfields = ('subject.language', ); foreach my $field (@olacfields) { ($lookup = $field) =~ s/(\w+)/\u\L$1/g; $lookup = "dc:$lookup"; foreach my $subdoc (keys %{ $info }) { foreach my $element (@{ $$info{$subdoc}{$lookup} }) { if ($field =~ /language/) { my $lang = language_code($element, $mpfx); $content .= "<$field identifier=\"$lang\" />\n"; } else { $content .= "<$field>$element"; $content .= " for subdoc $subdoc" if ($subdoc ne ''); $content .= "\n"; } } } } } # fields from Perseus only if ($mpfx eq 'perseus') { my @persfields = qw(citation layout.stylesheet layout.template status funder witness speaker method correctionlevel); foreach my $field (@persfields) { ($lookup = $field) =~ s/(\w+)/\u\L$1/g; $lookup = "perseus:$lookup"; foreach my $subdoc (keys %{ $info }) { foreach my $element (@{ $$info{$subdoc}{$lookup} }) { $content .= "<$field>$element"; $content .= " for subdoc $subdoc" if ($subdoc ne ''); $content .= "\n"; } } } my %persdc = ('date.available' => 'dc:Date.Available', 'relation.isversionof' => '[dc:Relation.IsVersionOf]', 'relation.iscommentaryon' => '[dc:Relation.IsCommentaryOn]'); foreach my $field(keys %persdc) { $lookup = $persdc{$field}; next if ($lookup eq ''); foreach my $subdoc (keys %{ $info }) { foreach my $element (@{ $$info{$subdoc}{$lookup} }) { $content .= "<$field>$element"; $content .= " for subdoc $subdoc" if ($subdoc ne ''); $content .= "\n"; } } } } $content .= "\n"; $content .= "\n
\n"; } sub language_code { my ($in_lang, $mpfx) = @_; my %rfc1766 = (en => 'eng', la => 'lat', greek => 'grc', de => 'ger', es => 'spa', fr => 'fre', it => 'ita', sanskrit => 'san', ); my %ethnologue = (en => 'ENG', la => 'LTN', greek => 'GKO', de => 'GER', es => 'SPN', fr => 'FRN', it => 'ITN', sanskrit => 'SKT', ); my $out_lang; if ($mpfx eq 'olac') { $out_lang = $ethnologue{$in_lang}; $out_lang = "x-sil-$out_lang" unless ($out_lang eq ''); } else { $out_lang = $rfc1766{$in_lang}; } $out_lang = $in_lang if ($out_lang eq ''); $out_lang; } sub use_record { my ($currec, $id, $set, $until, $from) = @_; return 0 unless $validStatus{$$currec{'perseus:Status'}->[0]->[0]}; my $good = 1; # check the type # if it doesn't have a type, assume it's not real $good = exists($$currec{'dc:Type'}); # currently we're only distributing meta-data for texts, # not images, image collections, maps, or tools if ($$currec{'dc:Type'}->[0][0] ne 'text') { $good = 0; } return $good unless $good; # check the series if ($id =~ /\.00\./ and ! $Perseus::development) { $good = 0; } return $good unless $good; # check the collection if ($set ne '') { my $found = 0; foreach my $one_coll (@{ $$currec{'dc:Relation.IsPartOf'} }) { if ($$one_coll[1] eq $set) { $found = 1; last; } } $good = $found; } return $good unless $good; # check the date my $avail = $$currec{'dc:Date.Available'}->[0][0]; if ($from ne '' or $until ne '') { if ($avail eq '') { $good = 0; # treat missing date as out of range } else { my $date_check = 1; $date_check = 0 if ((($from ne '') and ($avail lt $from)) or (($until ne '') and ($avail gt $until))); $good = $date_check; } } return $good unless $good; # more checks could come here if necessary $good; }