<?php
/**
 * Maintenance script that writes an RDF dump of the wiki's categories and
 * their sub-category links.
 *
 * Recovered from a gitweb blobdiff of maintenance/dumpCategoriesAsRdf.php.
 *
 * NOTE(review): the extraction dropped everything between the opening PHP tag
 * and the first statement of the constructor body (license header, imports,
 * the require of the Maintenance base, the class header and the property
 * declarations). Those parts are reconstructed below from how the rest of the
 * file uses them -- confirm each against the upstream file before merging.
 */

// NOTE(review): reconstructed imports; the namespaces are assumed -- confirm.
use Wikimedia\Purtle\RdfWriter;
use Wikimedia\Purtle\RdfWriterFactory;
use Wikimedia\Rdbms\IDatabase;

// NOTE(review): reconstructed; RUN_MAINTENANCE_IF_MAIN at the bottom of the
// file presumably comes from this include -- confirm.
require_once __DIR__ . '/Maintenance.php';

/**
 * Dumps every page in the category namespace, plus its 'subcat' category
 * links, through an RDF writer to a file or to stdout.
 *
 * NOTE(review): "extends Maintenance" is inferred from $maintClass and the
 * addDescription()/addOption()/getOption()/getDB() calls -- confirm.
 */
class DumpCategoriesAsRdf extends Maintenance {
	/**
	 * Writer that buffers the serialized RDF; created in execute().
	 * NOTE(review): declaration and visibility reconstructed -- confirm.
	 * @var RdfWriter
	 */
	private $rdfWriter;

	/**
	 * Helper that turns category rows into RDF statements; created in execute().
	 * NOTE(review): declaration and visibility reconstructed -- confirm.
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	/**
	 * Register the script description, the batch size and the CLI options.
	 *
	 * NOTE(review): the signature and the parent::__construct() call are
	 * reconstructed; only the statements after them were visible.
	 */
	public function __construct() {
		parent::__construct();

		$this->addDescription( "Generate RDF dump of categories in a wiki." );

		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.",
			false, true );
		$this->addOption( 'format', "Set the dump format.", false, true );
	}

	/**
	 * Produce row iterator for categories.
	 *
	 * Iterates the 'page' table in batches of $this->mBatchSize, keyed on
	 * page_title, restricted to the category namespace, fetching page_title
	 * and page_id.
	 *
	 * @param IDatabase $dbr Database connection
	 * @return RecursiveIterator
	 */
	public function getCategoryIterator( IDatabase $dbr ) {
		$it = new BatchRowIterator(
			$dbr,
			'page',
			[ 'page_title' ],
			$this->mBatchSize
		);
		$it->addConditions( [
			'page_namespace' => NS_CATEGORY,
		] );
		$it->setFetchColumns( [ 'page_title', 'page_id' ] );
		return $it;
	}

	/**
	 * Get iterator for links for categories.
	 *
	 * Selects the 'subcat' rows of 'categorylinks' whose cl_from is one of the
	 * given page IDs. The batch iterator is wrapped in a
	 * RecursiveIteratorIterator so that callers receive individual rows
	 * instead of batches.
	 *
	 * @param IDatabase $dbr
	 * @param array $ids List of page IDs
	 * @return Traversable
	 */
	public function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
		$it = new BatchRowIterator(
			$dbr,
			'categorylinks',
			[ 'cl_from', 'cl_to' ],
			$this->mBatchSize
		);
		$it->addConditions( [
			'cl_type' => 'subcat',
			'cl_from' => $ids
		] );
		$it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
		return new RecursiveIteratorIterator( $it );
	}

	/**
	 * Write the statements describing the dump itself (type, license, format
	 * version, modification date, parent site, imported ontology) to the
	 * RDF writer.
	 *
	 * @param int|string $timestamp Dump time, in any form wfTimestamp() accepts
	 */
	public function addDumpHeader( $timestamp ) {
		global $wgRightsUrl;

		$licenseUrl = $wgRightsUrl;
		// A protocol-relative license URL is made absolute by assuming https.
		if ( substr( $licenseUrl, 0, 2 ) === '//' ) {
			$licenseUrl = 'https:' . $licenseUrl;
		}

		$this->rdfWriter->about( wfExpandUrl( '/categoriesDump', PROTO_CANONICAL ) )
			->a( 'schema', 'Dataset' )
			->a( 'owl', 'Ontology' )
			->say( 'cc', 'license' )->is( $licenseUrl )
			->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
			->say( 'schema', 'dateModified' )
				->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' )
			->say( 'schema', 'isPartOf' )->is( wfExpandUrl( '/', PROTO_CANONICAL ) )
			->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
	}

	/**
	 * Run the dump: open the output stream, write the prefixes and the dump
	 * header, then walk the categories batch by batch, writing each category
	 * and its sub-category links and draining the writer after every batch.
	 */
	public function execute() {
		$outFile = $this->getOption( 'output', 'php://stdout' );

		// '-' is accepted as an alias for stdout.
		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'w' );
		if ( $output === false ) {
			// Fail before any database work instead of calling fwrite() on false.
			$this->error( "Could not open output file '$outFile' for writing.", 1 );
			// Only reached if error() does not terminate the script.
			return;
		}

		$this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$this->addDumpHeader( time() );
		fwrite( $output, $this->rdfWriter->drain() );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		foreach ( $this->getCategoryIterator( $dbr ) as $batch ) {
			// Map page ID => title for this batch, so that link rows (which
			// only carry cl_from) can be resolved back to a category title.
			$pages = [];
			foreach ( $batch as $row ) {
				$this->categoriesRdf->writeCategoryData( $row->page_title );
				$pages[$row->page_id] = $row->page_title;
			}

			foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) {
				$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
			}

			// Write out what the writer has buffered for this batch.
			fwrite( $output, $this->rdfWriter->drain() );
		}

		fflush( $output );
		// The original guarded this with "$outFile !== '-'", which was always
		// true because '-' had already been rewritten above; the handle was
		// therefore always closed, and still is.
		fclose( $output );
	}

	/**
	 * Create the RDF writer for the requested output format.
	 *
	 * @param string $format Writer format
	 * @return RdfWriter
	 */
	private function createRdfWriter( $format ) {
		$factory = new RdfWriterFactory();
		return $factory->getWriter( $factory->getFormatName( $format ) );
	}
}

$maintClass = "DumpCategoriesAsRdf";
require_once RUN_MAINTENANCE_IF_MAIN;