In [1]:
from huggingface_hub import snapshot_download
# Fetch only the files under the "2025-02-06 01:30:41" folder of the dataset
# repository and place them in the current working directory.
snapshot_download(
    repo_id="hendrick-chong-02/malaysian-chinese-youtube",
    repo_type='dataset',
    allow_patterns=["2025-02-06 01:30:41/*"],
    local_dir='./',
)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 209 files:   0%|▎                                                                            | 1/209 [00:51<2:58:40, 51.54s/it]

KeyboardInterrupt



In [44]:
import pandas as pd
from glob import glob
from torch.utils.data import DataLoader, Dataset
from datasets import Audio
import pyarrow.parquet as pq

def get_parquet_row_count(file_path):
    """Return the number of rows stored in a parquet file.

    Only the file footer (metadata) is read, so no column data is loaded.

    Args:
        file_path: Path of the parquet file to inspect.

    Returns:
        The row count recorded in the file's metadata.
    """
    # read_metadata is the dedicated footer reader; it avoids keeping a
    # ParquetFile reader object around just to look at one metadata field.
    return pq.read_metadata(file_path).num_rows
    
# Every parquet shard sitting exactly one directory below the working
# directory, in lexicographic order so the global row numbering is stable.
files = glob('*/*.parquet')
files.sort()
len(files)

59

In [60]:
%%time

# Build a lookup keyed by the first global row index of every shard.
# NOTE: despite its name, 'end' holds the shard's row count (not an absolute
# end index); consumers must add it to 'start' to get the exclusive bound.
global_indices = {}
start = 0
for f in files:
    row_size = get_parquet_row_count(f)
    row = dict(start=start, end=row_size, filename=f)
    global_indices[start] = row
    # The next shard begins right after the last row of this one.
    start += row_size

CPU times: user 9.11 ms, sys: 3.13 ms, total: 12.2 ms
Wall time: 11.3 ms


In [55]:
class ParquetDataset(Dataset):
    """Map-style dataset that serves single rows out of several parquet files.

    Rows are addressed by one global integer index spanning all files. Whole
    files are loaded as DataFrames on demand and kept in a small
    least-recently-used cache so neighbouring accesses do not re-read the file.

    Args:
        indices: Mapping from a shard's first global row index to a dict with
            the keys 'start' (same first index), 'end' (used here as the
            shard's number of rows: the exclusive bound is start + end) and
            'filename' (path of the parquet file).
        maxlen_cache_df: Maximum number of DataFrames kept in memory at once.
            A value <= 0 disables caching.
    """

    def __init__(self, indices, maxlen_cache_df=5):
        # Expand each shard into one entry per global row index so that
        # __getitem__ can find the owning shard with a single dict lookup.
        self.indices = {}
        for k, v in indices.items():
            for i in range(int(k), v['start'] + v['end']):
                self.indices[i] = v

        self.max_index = len(self.indices)
        # Plain dicts keep insertion order, which is used as recency order:
        # first key = least recently used, last key = most recently used.
        self.cache_df = {}
        self.maxlen_cache_df = maxlen_cache_df
        self.audio = Audio(sampling_rate=16000)

    def __len__(self):
        """Return the total number of addressable rows."""
        return self.max_index

    def _load_frame(self, filename):
        """Return the DataFrame for `filename`, reading it on a cache miss."""
        if filename in self.cache_df:
            # Re-insert to mark this shard as the most recently used one.
            df = self.cache_df.pop(filename)
            self.cache_df[filename] = df
            return df

        df = pd.read_parquet(filename)
        if self.maxlen_cache_df > 0:
            # Evict least recently used shards until there is room.
            while len(self.cache_df) >= self.maxlen_cache_df:
                self.cache_df.pop(next(iter(self.cache_df)))
            self.cache_df[filename] = df
        return df

    def __getitem__(self, item):
        """Return row `item` as a dict with its 'audio' field decoded.

        Negative indices count from the end, as with a list.

        Raises:
            IndexError: If `item` does not address an existing row. Raising
                IndexError (rather than KeyError) is what lets plain
                ``for row in dataset`` iteration terminate cleanly.
        """
        if item < 0:
            item = self.max_index + item

        try:
            v = self.indices[item]
        except KeyError:
            raise IndexError(
                f'index {item} is out of range for dataset of size {self.max_index}'
            ) from None

        # Position of the row inside its own shard.
        chunk_index = item - v['start']
        df = self._load_frame(v['filename'])

        row = df.iloc[chunk_index].to_dict()
        # Round-trip through the Audio feature (configured with
        # sampling_rate=16000) and keep only the decoded sample array.
        row['audio'] = self.audio.decode_example(self.audio.encode_example(row['audio']))['array']
        return row

In [61]:
# Wrap all indexed shards in one dataset; its length is the total row count
# across every parquet file.
dataset = ParquetDataset(global_indices)
len(dataset)

35833

In [62]:
%%time

# On a cache miss this access reads the whole owning parquet file before
# returning the row, so the timing includes that load.
dataset[1000]

CPU times: user 223 ms, sys: 1.56 s, total: 1.78 s
Wall time: 812 ms


{'audio': array([-0.00125122, -0.00079346,  0.        , ..., -0.00183105,
         0.00012207,  0.00296021]),
 'video id': '18RUjXWtV28',
 'chunk number': 0,
 'transcription': '我前,身份在身边好啦,是Good morning!今天是一个非常开心的一天看我春风满面就知道最近发生了两件很重要的事情要跟大家分享No.1Hepi在上个星期拿了人生第一张Driving license那一个下午当下我直接去买男生第一辆车买了新车回到家的第一个moment我打开门发生第二件很值得开心的事情Hepi一进家就发现一个包裹一打开包裹你知道是什么吗竟然是全新的Sony A7C',
 'timestamp': "[{'timestamp': (0.0, 1.54), 'text': '我前,身份在身边'}, {'timestamp': (1.54, 2.3), 'text': '好啦,是'}, {'timestamp': (2.3, 3.6), 'text': 'Good morning!'}, {'timestamp': (3.6, 5.3), 'text': '今天是一个非常开心的一天'}, {'timestamp': (5.3, 7.6), 'text': '看我春风满面就知道最近发生了'}, {'timestamp': (7.6, 9.6), 'text': '两件很重要的事情要跟大家分享'}, {'timestamp': (9.6, 10.14), 'text': 'No.1'}, {'timestamp': (10.14, 11.06), 'text': 'Hepi在上个星期'}, {'timestamp': (11.06, 12.56), 'text': '拿了人生第一张Driving license'}, {'timestamp': (12.56, 13.34), 'text': '那一个下午'}, {'timestamp': (13.34, 14.06), 'text': '当下'}, {'timestamp': (14.06, 14.64), 'text': '我直接'}, {'timestam

In [63]:
%%time

# Neighbouring row; presumably in the same shard as index 1000, in which case
# it is served from the in-memory DataFrame cache (compare the timing).
dataset[1001]

CPU times: user 4.19 ms, sys: 2.22 ms, total: 6.41 ms
Wall time: 5.41 ms


{'audio': array([ 7.32421875e-04, -6.10351562e-05, -1.12915039e-03, ...,
        -5.13000488e-02, -4.47082520e-02, -3.10974121e-02]),
 'video id': '18RUjXWtV28',
 'chunk number': 1,
 'transcription': '我只能講我太開心Sony來得太對時機了所以這一次的VlogHabby決定使用這台全新的A7C一邊拍攝一邊開上我的新車哈哈這樣子的話你可以在同一支影片裡面看到兩個的小寶貝這樣啊Ida cutBefore看車之前呢Habby先把今天這一集最廢的嘉賓請出來她是人稱書棒區美女收割機很像換女朋友喂喂哈哈哈哈女朋友你刷Credit看來會快書家說好壞',
 'timestamp': "[{'timestamp': (0.0, 1.6), 'text': '我只能講我太開心'}, {'timestamp': (1.6, 3.76), 'text': 'Sony來得太對時機了'}, {'timestamp': (3.76, 4.68), 'text': '所以這一次的Vlog'}, {'timestamp': (4.68, 7.5), 'text': 'Habby決定使用這台全新的A7C'}, {'timestamp': (7.5, 8.3), 'text': '一邊拍攝'}, {'timestamp': (8.3, 9.76), 'text': '一邊開上我的新車'}, {'timestamp': (9.76, 10.34), 'text': '哈哈'}, {'timestamp': (10.34, 11.0), 'text': '這樣子的話'}, {'timestamp': (11.0, 12.96), 'text': '你可以在同一支影片裡面看到'}, {'timestamp': (12.96, 14.5), 'text': '兩個的小寶貝'}, {'timestamp': (14.5, 15.2), 'text': '這樣啊'}, {'timestamp': (15.2, 15.94), 'text': 'Ida cut'}, {'timestamp': (15.94, 17.26