scripts.mit.edu Git - autoinstalls/mediawiki.git/blob - maintenance/dumpCategoriesAsRdf.php
MediaWiki 1.30.2-scripts
[autoinstalls/mediawiki.git] / maintenance / dumpCategoriesAsRdf.php
1 <?php
2 /**
3  * This program is free software; you can redistribute it and/or modify
4  * it under the terms of the GNU General Public License as published by
5  * the Free Software Foundation; either version 2 of the License, or
6  * (at your option) any later version.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License along
14  * with this program; if not, write to the Free Software Foundation, Inc.,
15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16  * http://www.gnu.org/copyleft/gpl.html
17  *
18  */
19 use Wikimedia\Purtle\RdfWriter;
20 use Wikimedia\Purtle\RdfWriterFactory;
21 use Wikimedia\Rdbms\IDatabase;
22
23 require_once __DIR__ . '/Maintenance.php';
24
/**
 * Maintenance script to provide RDF representation of the category tree.
 *
 * For every page in the category namespace the script writes the category's
 * own data followed by the 'subcat' category links originating from it.
 * Output goes to the file named by --output (standard output by default) in
 * the serialization named by --format ("ttl" by default).
 *
 * @ingroup Maintenance
 * @since 1.30
 */
class DumpCategoriesAsRdf extends Maintenance {
        /**
         * Writer that buffers the RDF output until it is drained.
         * @var RdfWriter
         */
        private $rdfWriter;
        /**
         * Categories RDF helper.
         * @var CategoriesRdf
         */
        private $categoriesRdf;

        public function __construct() {
                parent::__construct();

                $this->addDescription( "Generate RDF dump of categories in a wiki." );

                $this->setBatchSize( 200 );
                $this->addOption( 'output', "Output file (default is stdout). Will be overwritten.",
                        false, true );
                $this->addOption( 'format', "Set the dump format.", false, true );
        }

        /**
         * Produce row iterator for categories.
         *
         * Rows are restricted to the category namespace and carry the
         * page_title and page_id columns.
         *
         * @param IDatabase $dbr Database connection
         * @return RecursiveIterator
         */
        public function getCategoryIterator( IDatabase $dbr ) {
                $categories = new BatchRowIterator(
                        $dbr,
                        'page',
                        [ 'page_title' ],
                        $this->mBatchSize
                );
                $categories->addConditions( [
                        'page_namespace' => NS_CATEGORY,
                ] );
                $categories->setFetchColumns( [ 'page_title', 'page_id' ] );

                return $categories;
        }

        /**
         * Get iterator for links for categories.
         *
         * Only 'subcat' links whose cl_from is one of the given page IDs are
         * returned. The batches are flattened, so callers iterate over
         * individual rows carrying cl_from and cl_to.
         *
         * @param IDatabase $dbr
         * @param array $ids List of page IDs
         * @return Traversable
         */
        public function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
                $links = new BatchRowIterator(
                        $dbr,
                        'categorylinks',
                        [ 'cl_from', 'cl_to' ],
                        $this->mBatchSize
                );
                $links->addConditions( [
                        'cl_type' => 'subcat',
                        'cl_from' => $ids
                ] );
                $links->setFetchColumns( [ 'cl_from', 'cl_to' ] );

                return new RecursiveIteratorIterator( $links );
        }

        /**
         * Write the dataset header describing the dump itself.
         *
         * The header records the license URL, the format version, the
         * modification time, the wiki the dump belongs to and the imported
         * ontology.
         *
         * @param mixed $timestamp Dump time; handed to wfTimestamp() for
         *  conversion to ISO 8601 (execute() supplies time())
         */
        public function addDumpHeader( $timestamp ) {
                global $wgRightsUrl;

                $licenseUrl = $wgRightsUrl;
                // A protocol-relative license URL is given an explicit https scheme.
                if ( substr( $licenseUrl, 0, 2 ) === '//' ) {
                        $licenseUrl = 'https:' . $licenseUrl;
                }

                $this->rdfWriter->about( wfExpandUrl( '/categoriesDump', PROTO_CANONICAL ) )
                        ->a( 'schema', 'Dataset' )
                        ->a( 'owl', 'Ontology' )
                        ->say( 'cc', 'license' )->is( $licenseUrl )
                        ->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
                        ->say( 'schema', 'dateModified' )
                                ->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' )
                        ->say( 'schema', 'isPartOf' )->is( wfExpandUrl( '/', PROTO_CANONICAL ) )
                        ->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
        }

        /**
         * Run the dump: open the output, write the header, then stream the
         * categories and their links batch by batch.
         */
        public function execute() {
                $outFile = $this->getOption( 'output', 'php://stdout' );

                // "-" is accepted as an alias for standard output.
                if ( $outFile === '-' ) {
                        $outFile = 'php://stdout';
                }

                $output = fopen( $outFile, 'w' );
                if ( $output === false ) {
                        // Stop before any database work is done: without a valid handle
                        // every later fwrite() would fail and the dump would be lost.
                        // NOTE(review): relies on Maintenance::error( $msg, $exitCode )
                        // terminating the script, as in MediaWiki 1.30; the return below
                        // keeps us safe should it come back.
                        $this->error( "Could not open output file '$outFile' for writing.", 1 );
                        return;
                }

                $this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
                $this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );

                $this->categoriesRdf->setupPrefixes();
                $this->rdfWriter->start();

                $this->addDumpHeader( time() );
                fwrite( $output, $this->rdfWriter->drain() );

                $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

                foreach ( $this->getCategoryIterator( $dbr ) as $batch ) {
                        // Page ID => title, so that link rows (which only carry the
                        // source page ID in cl_from) can be resolved back to a title.
                        $pages = [];
                        foreach ( $batch as $row ) {
                                $this->categoriesRdf->writeCategoryData( $row->page_title );
                                $pages[$row->page_id] = $row->page_title;
                        }

                        $links = $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) );
                        foreach ( $links as $row ) {
                                $this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
                        }

                        // Drain once per batch so the writer's buffer stays small.
                        fwrite( $output, $this->rdfWriter->drain() );
                }

                fflush( $output );
                // Closing is safe for stdout as well: a handle obtained through
                // fopen( 'php://stdout' ) is a duplicate descriptor, so the process's
                // own STDOUT stays open.
                fclose( $output );
        }

        /**
         * Build an RDF writer for the requested serialization.
         *
         * @param string $format Writer format
         * @return RdfWriter
         */
        private function createRdfWriter( $format ) {
                $factory = new RdfWriterFactory();
                $formatName = $factory->getFormatName( $format );

                return $factory->getWriter( $formatName );
        }
}
156
// Name the class to run, then include the file referenced by RUN_MAINTENANCE_IF_MAIN.
$maintClass = DumpCategoriesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;