filtering_issues.py
"""Filtering GitHub issues dataset"""
import logging
import time
from functools import partial
from datasets import load_dataset
from datasets.utils.logging import set_verbosity_warning
from transformers import HfArgumentParser
from arguments import FilteringArguments
from utils.manual_sharding import save_manual_shards
from utils.utils_issues import (filter_on_users_size, merge_text_columns,
remove_bot_comments, replace_usernames,
strip_automated_email_text,
truncate_long_comments)
MIN_CHARS = 200
MAX_CHARS = 7000
MAX_EVENTS = 10
MAX_LINES = 80
def parse_args():
    parser = HfArgumentParser(FilteringArguments)
    return parser.parse_args()


def log_stats(logger, filter_name, old_size, new_size, old_size_gb, new_size_gb):
    """Log the dataset size and text volume before and after a filtering step."""
    logger.info(
        f"Dataset size before {filter_name}: {old_size} issues, total text in events is {old_size_gb / 1e9:.2f} GB"
    )
    logger.info(
        f"Dataset size after {filter_name}: {new_size} issues, total text in events is {new_size_gb / 1e9:.2f} GB"
    )
    logger.info(
        f"Percentage of filtered issues: {100 * (old_size - new_size) / old_size:.2f}%"
    )
    logger.info(
        f"Percentage of filtered text volume: {100 * (old_size_gb - new_size_gb) / old_size_gb:.2f}%"
    )

def preprocess(logger, args):
    # Load dataset
    t_start = time.time()
    logger.info(f"===== Loading {args.dataset_name} and subset {args.subset} =====")
    dataset = load_dataset(
        args.dataset_name,
        split=args.split,
        data_dir=args.subset,
        use_auth_token=True,
        num_proc=args.num_workers,
    )
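    # NOTE: newer `datasets` releases deprecate `use_auth_token=True` in favor of
    # `token=True`; keep whichever matches the installed version.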
logger.info(f"Dataset {dataset}\nLoaded in {time.time() - t_start:.2f} seconds")
# basic processing
logger.info(f"===== Basic processing dataset=====")
dataset = (
dataset.map(merge_text_columns, num_proc=args.num_workers)
.map(strip_automated_email_text, num_proc=args.num_workers)
.map(
lambda x: {"text_size": sum([len(event["text"]) for event in x["events"]])},
num_proc=args.num_workers,
)
)
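    # "text_size" is the total number of characters across an issue's events; it
    # is recomputed after each transformation so the logged volumes stay accurate.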
    old_size = len(dataset)
    old_size_gb = sum(dataset["text_size"])
    logger.info(
        f"Dataset size before any filtering: {old_size} issues, total text in events is {old_size_gb / 1e9:.2f} GB"
    )

    # Truncate long comments
    logger.info("===== Truncating long comments =====")
    dataset = dataset.map(
        partial(truncate_long_comments, max_lines=MAX_LINES), num_proc=args.num_workers
    ).map(
        lambda x: {"text_size": sum(len(event["text"]) for event in x["events"])},
        num_proc=args.num_workers,
    )
    new_size_gb = sum(dataset["text_size"])
    # Truncation does not drop issues, only text, so the issue count is unchanged
    log_stats(
        logger, "truncating long comments", old_size, old_size, old_size_gb, new_size_gb
    )

    # Bot filter
    logger.info("===== Filtering comments from bots =====")
    dataset = dataset.map(remove_bot_comments, num_proc=args.num_workers)
    dataset = dataset.filter(lambda x: not x["bot_issue"])
    dataset = dataset.map(
        lambda x: {"text_size": sum(len(event["text"]) for event in x["events"])},
        num_proc=args.num_workers,
    )
    new_size = len(dataset)
    new_size_gb = sum(dataset["text_size"])
    log_stats(logger, "bots filter", old_size, new_size, old_size_gb, new_size_gb)
logger.info(f"===== Adding users and events count columns =====")
dataset = dataset.map(
lambda x: {"user_count": len(set(event["author"] for event in x["events"]))},
num_proc=args.num_workers,
).map(lambda x: {"event_count": len(x["events"])}, num_proc=args.num_workers)
logger.info(f"===== Filtering issues based on users and minimal size=====")
    dataset = dataset.filter(
        partial(
            filter_on_users_size,
            minimum=MIN_CHARS,
            maximum=MAX_CHARS,
            max_events=MAX_EVENTS,
        )
    )
    size_users = len(dataset)
    size_users_gb = sum(dataset["text_size"])
    log_stats(logger, "users filter", new_size, size_users, new_size_gb, size_users_gb)

    # Replace usernames
    logger.info("===== Replacing usernames =====")
    dataset = dataset.map(replace_usernames, num_proc=args.num_workers)
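    # replace_usernames is expected to anonymize usernames in the text and to set
    # the "modified_usernames" flag used below to report how many issues changed.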
    modified_data = dataset.filter(lambda x: x["modified_usernames"])
    logger.info(
        f"Percentage of issues with modified usernames: {(len(modified_data) * 100) / len(dataset):.2f}%"
    )
    logger.info(
        f"Final dataset has {size_users} samples and {size_users_gb / 1e9:.2f} GB of text, "
        f"equivalent to removing {100 * (old_size - size_users) / old_size:.2f}% of issues "
        f"and {100 * (old_size_gb - size_users_gb) / old_size_gb:.2f}% of text."
    )
    logger.info(f"Dataset processed in {time.time() - t_start:.2f} seconds")
    return dataset

if __name__ == "__main__":
    args = parse_args()
    set_verbosity_warning()
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
        handlers=[logging.FileHandler("filtering.log"), logging.StreamHandler()],
    )
    logger.info(f"Filtering GitHub issues dataset, arguments:\n{args}")
    dataset = preprocess(logger, args)

    # Save dataset
    t_start = time.time()
    if args.push_to_hub:
        logger.info(f"Pushing dataset to the Hub at {args.remote_repo}")
        dataset.push_to_hub(args.remote_repo)
    else:
        logger.info(
            f"Saving the dataset in manual shards in a clone of {args.hub_username}/{args.remote_repo}"
        )
        save_manual_shards(
            dataset, user=args.hub_username, remote_dataset_repo=args.remote_repo
        )
    logger.info(f"Dataset successfully saved in {time.time() - t_start:.2f} seconds")