第六天:完成爬蟲


Posted by luckstar77 on 2022-10-27

做了六天,總算把爬蟲部分整個完成。現在已經可以自動爬取 ETF 及個股的除權息資料,包含股價、歷年平均殖利率、統計年分、填權息次數、填權息成功率等。

import { connect as redisConnect } from './db/redis';
import { connect as mongodbConnect } from './db/mongodb';
import axios from 'axios';
import * as acorn from 'acorn';
import * as cheerio from 'cheerio';
import * as R from 'ramda';

/** Script that, once parsed, yields the list of "<id> <name>" stock entries to crawl. */
const STOCK_IDS_URL = 'https://goodinfo.tw/tw/StockLib/js/TW_STOCK_ID_NM_LIST.js?0';
/** Dividend-policy page URL; the stock id is appended directly after `STOCK_ID=`. */
const DIVIDEND_PREFIX_URL = 'https://goodinfo.tw/tw/StockDividendPolicy.asp?STOCK_ID=';
/** MongoDB collection that receives one upserted document per stock id. */
const COLLECTION = 'stock';

/**
 * Outcome recorded for a single row of the dividend detail table.
 *
 * The numeric values are spelled out because other code in this file compares
 * states against raw numbers; do not renumber.
 */
enum DividendState {
    /** A dividend was paid and its "spend days" cell holds a value (not '-'). */
    SUCCESS = 0,
    /** A dividend was paid but its "spend days" cell is '-'. */
    FAILURE = 1,
    /** Neither a cash nor a stock dividend was paid on the row. */
    NOTHING = 2,
}

/**
 * Dividend outcomes grouped by year.
 *
 * Each key is a year taken from the first cell of a detail-table row; the
 * value lists one state per row collected for that year, in table order
 * (the year row itself first, then any following '∟' rows).
 */
interface Dividend {
    [key: string]: DividendState[]
}

/** Redis key holding the position, within the stock-id list, of the next stock to crawl. */
const STOCK_ID_INDEX_KEY = 'STOCK_ID_INDEX';

/** Position of the first usable entry in the stock-id list; earlier entries are never crawled. */
const FIRST_STOCK_INDEX = 2;

/**
 * Turns the raw cursor value read from Redis into a usable list position.
 *
 * @param raw - Value stored under the cursor key, or `null`/`undefined` when the key is missing.
 * @param listLength - Number of entries in the stock-id list.
 * @returns `raw` parsed as a decimal integer when it lies in
 *          `[FIRST_STOCK_INDEX, listLength)`; otherwise `FIRST_STOCK_INDEX`,
 *          which restarts the crawl from the first usable entry.
 */
function resolveStockIndex(raw: string | null | undefined, listLength: number): number {
    const parsed = raw == null ? NaN : parseInt(raw, 10);
    // Both comparisons are false for NaN, so a missing or garbage cursor also falls back.
    const inRange = parsed >= FIRST_STOCK_INDEX && parsed < listLength;
    return inRange ? parsed : FIRST_STOCK_INDEX;
}

/**
 * Classifies one row of the dividend detail table.
 *
 * A dividend counts as paid when its cell is neither '0' nor '-'. A paid
 * dividend succeeds when the matching "spend days" cell is not '-'. A failed
 * cash dividend is final: the stock dividend is then not consulted.
 *
 * @param cashDividend - Text of the cash dividend cell.
 * @param stockDividend - Text of the stock dividend cell.
 * @param cashSpendDays - Text of the cash dividend "spend days" cell.
 * @param stockSpendDays - Text of the stock dividend "spend days" cell.
 * @returns The state for the row; `NOTHING` when no dividend was paid.
 */
function classifyDividendRow(
    cashDividend: string,
    stockDividend: string,
    cashSpendDays: string,
    stockSpendDays: string,
): DividendState {
    const wasPaid = (text: string): boolean => text !== '0' && text !== '-';

    let state = DividendState.NOTHING;
    if (wasPaid(cashDividend)) {
        state = cashSpendDays !== '-' ? DividendState.SUCCESS : DividendState.FAILURE;
    }
    if (wasPaid(stockDividend) && state !== DividendState.FAILURE) {
        state = stockSpendDays !== '-' ? DividendState.SUCCESS : DividendState.FAILURE;
    }
    return state;
}

/**
 * Decides whether a year counts as a failure.
 *
 * A year with a single row fails when that row failed. A year with several
 * rows is judged only by the rows after the first one (the '∟' rows).
 *
 * @param states - States collected for one year, in table order.
 * @returns `true` when the year counts as failed.
 */
function isFailedDividendYear(states: DividendState[]): boolean {
    if (states.length === 1) return states[0] === DividendState.FAILURE;
    return states.slice(1).includes(DividendState.FAILURE);
}

/**
 * Crawls one stock per run: reads the cursor from Redis, scrapes that stock's
 * dividend page, upserts the computed statistics into MongoDB, moves the
 * cursor forward and exits the process.
 */
(async () => {
    const mongodbClient = await mongodbConnect();
    const redisClient = await redisConnect();

    const {data: stockIdsText} = await axios.get(STOCK_IDS_URL);
    const stockIdsParsed = acorn.parse(stockIdsText, { ecmaVersion: 2020 });

    // TODO: https://github.com/acornjs/acorn/issues/741
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore
    const stockIds: {value: string}[] = stockIdsParsed.body[0].declarations[0].init.elements;
    if (stockIds.length <= FIRST_STOCK_INDEX) {
        throw new Error('Stock id list contains no usable entries');
    }

    const stockIdsIndex = resolveStockIndex(await redisClient.get(STOCK_ID_INDEX_KEY), stockIds.length);
    const [stockId, stockName] = stockIds[stockIdsIndex].value.split(' ');

    // Store the position after the one actually crawled instead of INCR-ing the
    // stored value: once the stored cursor runs past the end of the list, INCR
    // would keep it out of range and every run would restart at the same entry.
    // NOTE(review): assumes the Redis client exposes SET as `.set(key, value)`.
    const advanceCursor = () => redisClient.set(STOCK_ID_INDEX_KEY, String(stockIdsIndex + 1));

    const {data: dividendText} = await axios.get(DIVIDEND_PREFIX_URL + stockId);
    const $ = cheerio.load(dividendText);
    const price = parseFloat($('body > table:nth-child(8) > tbody > tr > td:nth-child(3) > table:nth-child(1) > tbody > tr > td:nth-child(1) > table > tbody > tr:nth-child(3) > td:nth-child(1)').text());
    const allAvgCashYields = parseFloat($('#divDividendSumInfo > div > div > table > tbody > tr:nth-child(4) > td:nth-child(5)').text());
    const allAvgRetroactiveYields = parseFloat($('#divDividendSumInfo > div > div > table > tbody > tr:nth-child(6) > td:nth-child(5)').text());

    // Nothing worth storing without a price or a non-zero average yield: skip this stock.
    if (isNaN(price) || allAvgRetroactiveYields === 0 || isNaN(allAvgRetroactiveYields)) {
        await advanceCursor();
        process.exit();
    }

    const $rows = $('#tblDetail > tbody > tr');
    const dividends: Dividend = {};
    let year: number | undefined;
    // Rows 0-3 and the last row are never read (presumably header/summary rows — confirm against the page).
    for (let i = 4; i < $rows.length - 1; i++) {
        const $cells = $rows.eq(i).children('td');
        const yearText = $cells.eq(0).text();
        if (/^\s*\d+\s*$/.test(yearText)) {
            // A year row opens a new bucket. Matching digits explicitly keeps an
            // empty cell from being coerced to a number and creating a NaN bucket.
            year = parseInt(yearText, 10);
            dividends[year] = [];
        } else if (yearText !== '∟') {
            // Neither a year row nor a '∟' row: ignore it.
            continue;
        }
        // A '∟' row seen before any year row has no bucket to go into.
        if (year === undefined) continue;

        dividends[year].push(classifyDividendRow(
            $cells.eq(3).text(),
            $cells.eq(6).text(),
            $cells.eq(10).text(),
            $cells.eq(11).text(),
        ));
    }

    const dividendsYears = R.keys(dividends);
    const amountOfDividend = dividendsYears.length;
    if (amountOfDividend === 0) {
        await advanceCursor();
        process.exit();
    }

    const failedYears = dividendsYears.filter(yearKey => isFailedDividendYear(dividends[yearKey]));
    const amountOfSuccess = amountOfDividend - failedYears.length;
    const successRate = (amountOfSuccess / amountOfDividend) * 100.00;
    const dividendYearStart = dividendsYears[0];
    const dividendYearEnd = dividendsYears[amountOfDividend - 1];

    await mongodbClient.collection(COLLECTION).updateOne({
        id: stockId
    }, {
        $set: {
            name: stockName,
            successRate,
            allAvgCashYields,
            allAvgRetroactiveYields,
            amountOfDividend,
            amountOfSuccess,
            dividendYearStart,
            dividendYearEnd
        },
        $currentDate: { updated: true },
    }, {
        upsert: true,
    });

    await advanceCursor();

    process.exit();
})().catch((error: unknown) => {
    // Without this handler a failure may leave the process alive on the open
    // database connections; report it and exit with a failing status instead.
    console.error(error);
    process.exit(1);
});









Related Posts

菜鳥切版2

菜鳥切版2

vite系列#vite與Pinia整合範例

vite系列#vite與Pinia整合範例

[ 前端工具 ] - gulp, Babel, SCSS, uglyfy

[ 前端工具 ] - gulp, Babel, SCSS, uglyfy


Comments