import wikipediaapi
import json
import time

def get_wiki_pages(categories=["Azərbaycan tarixi", "Azərbaycan mədəniyyəti", 
                             "Azərbaycan ədəbiyyatı", "Azərbaycan coğrafiyası"], 
                  min_length=500, max_pages=1000):
    """
    Recursively collect substantial Azerbaijani Wikipedia pages from multiple categories
    """
    wiki = wikipediaapi.Wikipedia(
        user_agent='AzGPTDataCollector/1.0',  # required by wikipedia-api >= 0.6
        language='az',
        extract_format=wikipediaapi.ExtractFormat.WIKI
    )
    
    collected_pages = {}
    visited_pages = set()
    
    def collect_pages(category_title):
        if len(collected_pages) >= max_pages:
            return
            
        category = wiki.page(f"Kateqoriya:{category_title}")
        if not category.exists():
            print(f"Category not found: {category_title}")
            return
            
        # First, process all articles in this category
        for member in category.categorymembers.values():
            if len(collected_pages) >= max_pages:
                return
                
            if member.title in visited_pages:
                continue
                
            visited_pages.add(member.title)
            
            # Keep only main-namespace articles; the namespace check also
            # skips portals and project pages that title prefixes would miss
            if member.ns != wikipediaapi.Namespace.MAIN:
                continue
                
            # Skip pages whose extract is too short; note that accessing
            # .text triggers the actual page fetch from the API
            if len(member.text) < min_length:
                continue
                
            collected_pages[member.title] = {
                'title': member.title,
                'text': member.text,
                'url': member.fullurl,
                'length': len(member.text)
            }
            print(f"Collected: {member.title} ({len(member.text)} chars)")
            
            # Delay to avoid hitting API limits
            time.sleep(0.1)
        
        # Then recurse into subcategories, stopping early once the cap is hit
        for subcategory in category.categorymembers.values():
            if len(collected_pages) >= max_pages:
                return
            if subcategory.ns == wikipediaapi.Namespace.CATEGORY:
                collect_pages(subcategory.title.replace('Kateqoriya:', ''))
                
    # Start collection from each category
    for category in categories:
        print(f"\nStarting collection from category: {category}")
        collect_pages(category)
    
    return collected_pages
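
# Illustrative usage (a hypothetical smoke test, not part of the pipeline):
# run against a single category with small limits to verify API access
# before a full collection pass. The category name is one of the defaults.
#
#   sample = get_wiki_pages(categories=["Azərbaycan tarixi"],
#                           min_length=200, max_pages=5)
#   for title, page in sample.items():
#       print(title, page['length'])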

def preprocess_text(text):
    """
    Basic cleanup for Azerbaijani Wikipedia text
    """
    # Normalize the ASCII ellipsis first, so the punctuation pass below
    # does not split '...' into '. . .'
    text = text.replace('...', '…')

    # Map the Cyrillic capital І (which occasionally leaks into
    # Latin-script Azerbaijani text) to the Latin capital I.
    # Note: a blanket 'i' -> 'ı' substitution would corrupt every valid
    # dotted i, so no such replacement is attempted here.
    text = text.replace('І', 'I')

    # Ensure a space after sentence-ending and separating punctuation;
    # opening brackets and quotes are deliberately excluded, since a
    # space after '(' or '«' would be wrong
    for punct in '.!?,;:':
        text = text.replace(punct, punct + ' ')

    # Collapse the extra whitespace introduced above along with any
    # irregular whitespace in the source text
    text = ' '.join(text.split())

    return text
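
# Illustrative example (hypothetical input): missing spaces after
# punctuation are repaired and the ASCII ellipsis is normalized.
#
#   preprocess_text("Bakı,Azərbaycanın paytaxtıdır.Tarixi...")
#   -> "Bakı, Azərbaycanın paytaxtıdır. Tarixi…"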

def save_dataset(pages, output_file='az_wiki_data.json'):
    """
    Save collected pages to a JSON file
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(pages, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(pages)} pages to {output_file}")

def main():
    # Collect pages with minimum length requirement
    print("Starting data collection...")
    pages = get_wiki_pages(min_length=500, max_pages=100)  # small cap for a trial run
    
    # Preprocess and save
    print("\nPreprocessing and saving data...")
    for title in pages:
        pages[title]['text'] = preprocess_text(pages[title]['text'])
        # Keep the stored length in sync with the cleaned text
        pages[title]['length'] = len(pages[title]['text'])
    
    save_dataset(pages)
    
    # Print statistics
    if pages:
        total_chars = sum(page['length'] for page in pages.values())
        print("\nCollection complete!")
        print(f"Total pages: {len(pages)}")
        print(f"Total characters: {total_chars}")
        print(f"Average page length: {total_chars / len(pages):.2f} characters")
        
        # Print some titles as examples
        print("\nSample of collected articles:")
        for title in list(pages.keys())[:5]:
            print(f"- {title} ({pages[title]['length']} chars)")

if __name__ == "__main__":
    main()