Scrape peraturan BPK dengan PHP, DiDOM dan Supabase

Eko Priyanto - Mar 2 - - Dev Community

Image description
Direktori Peraturan di Indonesia saat ini:

  • JDIHN (lambat)
  • Peraturan BPK (yang kita coba kali ini)
  • Peraturan.go.id (belum nyoba)
  • lainnya? (beritahu saya)

DiDOM adalah parser HTML dan XML yang sederhana dan cepat ("simple and fast HTML and XML parser").
Kalau Anda sudah terbiasa dengan Simple HTML DOM, DiDOM adalah alternatif yang lebih modern.
URL: https://github.com/Imangazaliev/DiDOM

Pertama kita buat table di Supabase

-- Destination table for the scraper script below: one row per scraped
-- https://peraturan.bpk.go.id/Details/<id> page.
-- Every scraped column is TEXT; the script stores the cleaned page text as-is
-- and omits columns whose value is empty, so those stay NULL.
CREATE TABLE peraturan_bpk (
    -- Surrogate key generated by the database; the script never sends it.
    id SERIAL PRIMARY KEY,
    -- tipe_dokumen .. bidang: filled from the page's label/value pairs
    -- (.col-lg-3 / .col-lg-9); the column name is the label after safe_url().
    -- NOTE(review): presumably these 17 names match the labels the site
    -- currently renders -- confirm against a live page.
    tipe_dokumen TEXT,
    judul TEXT,
    t_e_u TEXT,
    nomor TEXT,
    bentuk TEXT,
    bentuk_singkat TEXT,
    tahun TEXT,
    tempat_penetapan TEXT,
    tanggal_penetapan TEXT,
    tanggal_pengundangan TEXT,
    tanggal_berlaku TEXT,
    sumber TEXT,
    subjek TEXT,
    status TEXT,
    bahasa TEXT,
    lokasi TEXT,
    bidang TEXT,
    -- Text of the page's <h1>.
    judul_singkat TEXT,
    -- Text of the first <p> inside .card-body.
    brief TEXT,
    -- <li> items of the first .dash list, joined with "<br />".
    abstract TEXT,
    -- <li> items of the second .dash list, joined with "<br />".
    catatan TEXT,
    -- Link text and download URL taken from the second .fs-6 section.
    nama_file TEXT,
    link_download TEXT,
    -- Badge text (.bg-light-primary) of the third .fs-6 section, or
    -- 'Belum Tersedia' when the badge is absent.
    status_peraturan TEXT,
    -- <li> items of the third .fs-6 section, joined with "<br />".
    status_peraturan_item TEXT,
    -- Numeric <id> from the Details/<id> URL the row was scraped from.
    url_id TEXT
);
Enter fullscreen mode Exit fullscreen mode

Script lengkap + simpan ke supabase

<?php
// Send real errors to stderr so they end up in the nohup log. With reporting
// switched off entirely, a fatal error would stop the scraper without a trace.
error_reporting(E_ALL & ~E_DEPRECATED);
ini_set('display_errors', 'stderr');


// Supabase API Config
// SUPABASE_URL / SUPABASE_KEY environment variables take precedence so the
// real key does not have to be committed; the literals are the fallback.
$supabaseUrl = getenv('SUPABASE_URL') ?: 'https://hoyo.supabase.co';
$supabaseKey = getenv('SUPABASE_KEY') ?: 'Nd14s3nd4sm03';

// Name of the Supabase table the scraped rows are written to.
$tabel = 'peraturan_bpk';

// Resolve the Composer autoloader relative to this file, not the current
// working directory, so the script can be started from any directory.
require __DIR__ . '/vendor/autoload.php';

use DiDom\Document;
/**
 * Turn a label into a lowercase identifier made of letters, digits and "_".
 *
 * Every run of other characters collapses into a single underscore, and
 * underscores at either end are removed.
 *
 * @param string $text Label to convert.
 * @return string Converted identifier; may be empty.
 */
function safe_url($text) {
    $lowered = strtolower($text);
    $underscored = preg_replace('/[^a-z0-9]+/i', '_', $lowered);

    return trim($underscored, '_');
}

/**
 * Normalise a piece of scraped text.
 *
 * Collapses every run of whitespace into one space, then removes every
 * character that is not an ASCII letter, digit, space or one of ". , ; -".
 * NOTE(review): this also strips "/", "(", ")" and ":" -- confirm that is
 * intended for values such as regulation numbers.
 *
 * @param mixed $text Raw text; anything that is not a string yields ''.
 * @return string Cleaned, trimmed text ('' for blank or non-string input).
 */
function cleanText($text) {
    // Compare against '' rather than using empty(): empty('0') is true, which
    // turned a legitimate value of "0" into an empty string.
    if (!is_string($text) || trim($text) === '') {
        return '';
    }
    $text = preg_replace('/\s+/', ' ', $text);
    $text = preg_replace('/[^a-zA-Z0-9 .,;-]/', '', $text);
    return trim($text);
}


// Highest Details/<id> page to scrape; the loop walks downwards from here to 0.
$start = 314198;

// Returns the cleaned text of every <li> below $node, joined with "<br />".
$joinListItems = function ($node) {
    $items = [];
    foreach ($node->find('li') as $li) {
        $items[] = cleanText($li->text());
    }
    return implode('<br />', $items);
};

for ($i = $start; $i >= 0; $i--) {

    // Wait 0.1-9.9 s before every request except the first. Sleeping at the
    // top of the loop means the pause also applies after a failed iteration,
    // so a failing site or API is not retried back to back.
    if ($i !== $start) {
        usleep(rand(100000, 9900000));
    }

    $uri = "https://peraturan.bpk.go.id/Details/$i";
    // "@" hides the HTTP warning; the failure is reported just below instead.
    $html = @file_get_contents($uri);

    if ($html === false) {
        echo "\033[33m [$i] \033[0m  \e[31m [GAGAL]  \e[0m page not found \n";
        continue;
    }

    $document = new Document($html);

    // Query the .fs-6 sections once; they are used several times below.
    $fs6 = $document->find('.fs-6');
    if (!isset($fs6[0])) {
        echo "\033[33m [$i] \033[0m  \e[31m [GAGAL]  \e[0m belum ada isinya \n";
        continue;
    }

    // Metadata table: labels live in .col-lg-3, their values in .col-lg-9.
    $keys = [];
    foreach ($document->find('.col-lg-3') as $key_div) {
        $keys[] = safe_url($key_div->text());
    }

    $vals = [];
    foreach ($document->find('.col-lg-9') as $val_div) {
        $vals[] = cleanText($val_div->text());
    }

    // Use at most the first 17 pairs, and never more than both lists provide:
    // array_combine() fails when the two lists differ in length.
    $pairs = min(17, count($keys), count($vals));
    $data = $pairs > 0
        ? array_combine(array_slice($keys, 0, $pairs), array_slice($vals, 0, $pairs))
        : [];
    // A label without any letter or digit becomes '', which cannot be a column name.
    unset($data['']);

    // first() returns null when nothing matches, and "??" does not guard a
    // method call on null, so check the element before reading its text.
    $h1 = $document->first('h1');
    $data['judul_singkat'] = $h1 !== null ? cleanText($h1->text()) : '';

    $brief = $document->first('.card-body p');
    $data['brief'] = $brief !== null ? cleanText($brief->text()) : '';

    // First .dash list is the abstract, the second one (if any) the notes.
    $dash_lists = $document->find('.dash');
    if (isset($dash_lists[0])) {
        $data['abstract'] = $joinListItems($dash_lists[0]);

        if (isset($dash_lists[1])) {
            $data['catatan'] = $joinListItems($dash_lists[1]);
        }
    } else {
        $data['abstract'] = '';
        $data['catatan'] = '';
    }

    // Second .fs-6 section: file name is the first link's text, the download
    // URL the third link's href. Guarded like the third section below, since
    // a page may have fewer sections.
    if (isset($fs6[1])) {
        $link = $fs6[1]->find('a');
        if (isset($link[0], $link[2])) {
            $data['nama_file'] = cleanText($link[0]->text());
            $data['link_download'] = 'https://peraturan.bpk.go.id' . $link[2]->getAttribute('href');
        }
    }

    // Third .fs-6 section: status badge plus its list of related items.
    if (isset($fs6[2])) {
        $div_status = $fs6[2];

        $badge = $div_status->first('.bg-light-primary');
        $data['status_peraturan'] = $badge !== null ? cleanText($badge->text()) : 'Belum Tersedia';
        $data['status_peraturan_item'] = $joinListItems($div_status);
    }

    $data['url_id'] = $i;

    // Drop empty/null fields so they are not sent at all.
    $data = array_filter($data, function ($value) {
        return $value !== null && $value !== '';
    });

    $payload = json_encode($data);
    if ($payload === false) {
        echo "\033[33m [$i] \033[0m  \e[31m [GAGAL]  \e[0m json_encode: " . json_last_error_msg() . " \n";
        continue;
    }

    // Save to Supabase; the target table name comes from $tabel.
    // NOTE(review): the payload never contains the primary key `id`, so
    // "resolution=merge-duplicates" presumably never merges and a re-run
    // inserts the same url_id again -- confirm, and consider a unique
    // constraint on url_id together with ?on_conflict=url_id.
    $ch = curl_init("$supabaseUrl/rest/v1/$tabel");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Without a timeout one stalled connection would block the whole run.
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);
    curl_setopt($ch, CURLOPT_HTTPHEADER, [
        "Content-Type: application/json",
        "apikey: $supabaseKey",
        "Authorization: Bearer $supabaseKey",
        "Prefer: resolution=merge-duplicates"
    ]);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);

    $response = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    // curl_exec() returns false on a transport error; show cURL's own message
    // then, because there is no response body to print.
    $detail = $response === false ? curl_error($ch) : $response;
    curl_close($ch);

    if ($httpCode === 201 || $httpCode === 200) {
        echo "\033[33m [$i] \033[0m \033[32m  [SUKSES]  \033[0m   Simpan ke Supabase \n";
    } else {
        echo "\033[33m [$i] \033[0m  \e[31m [GAGAL]  \e[0m menyimpan data ke Supabase \n Status: \033[34m $detail \e[0m \n";
    }

    // Release the page before the next iteration's pause.
    unset($html, $document, $data);
}
Enter fullscreen mode Exit fullscreen mode

Jalankan di terminal CPanel

nohup php script.php > output.log 2>&1 &
//atau
nohup php script.php
Enter fullscreen mode Exit fullscreen mode

Melihat log di CPanel:
Dengan perintah pertama log tersimpan di file output.log; dengan perintah kedua (tanpa redirect) log tersimpan di file nohup.out.

Mematikan proses berjalan dengan nohup

pgrep -f script.php
# perintah di atas hanya menampilkan PID; hentikan prosesnya dengan:
kill <PID>

Enter fullscreen mode Exit fullscreen mode

Image description

. . . . . . . . . . . . . . . . . . . . . . .