# LaunchLLM / data_aggregation / quality_validator.py
# Uploaded by Bmccloud22 in commit ec8f374:
# "Deploy LaunchLLM - Production AI Training Platform"
"""
Quality Validator Module
Validates training data quality on multiple dimensions.
"""
import re
from typing import List, Dict, Any, Tuple
class QualityValidator:
    """Validate and score training data quality.

    Each example is expected to be a dict carrying an ``"instruction"`` and an
    ``"output"`` entry, both strings. Validation checks structure and length;
    scoring starts from 100 and subtracts heuristic penalties for validation
    issues, bad length, heavy repetition, missing sentences and implausible
    word lengths.
    """

    def __init__(
        self,
        min_length: int = 10,
        max_length: int = 5000,
        min_score: float = 60.0
    ):
        """
        Initialize quality validator.

        Args:
            min_length: Minimum output length in characters
            max_length: Maximum output length in characters
            min_score: Minimum quality score threshold (0-100)
        """
        self.min_length = min_length
        self.max_length = max_length
        self.min_score = min_score

    def validate_example(self, example: Dict[str, Any]) -> Tuple[bool, List[str]]:
        """
        Validate a single example.

        Checks are staged: missing fields are reported first, then fields of
        the wrong type, and only then the length/emptiness checks, because the
        later stages cannot run meaningfully when an earlier one fails.

        Args:
            example: Data example to validate

        Returns:
            Tuple of (is_valid, list_of_issues)
        """
        issues: List[str] = []

        # Stage 1: required fields must be present.
        for field in ("instruction", "output"):
            if field not in example:
                issues.append(f"Missing '{field}' field")
        if issues:
            return False, issues

        instruction = example["instruction"]
        output = example["output"]

        # Stage 2: both fields must be text. Without this guard a None value
        # raises TypeError on len() and other non-strings either raise on
        # .strip() or slip through the length checks by accident.
        for field, value in (("instruction", instruction), ("output", output)):
            if not isinstance(value, str):
                issues.append(
                    f"'{field}' must be a string (got {type(value).__name__})"
                )
        if issues:
            return False, issues

        # Stage 3: length and emptiness checks on the raw (unstripped) text.
        if len(instruction) < 5:
            issues.append("Instruction too short")
        if len(output) < self.min_length:
            issues.append(f"Output too short (min {self.min_length} chars)")
        if len(output) > self.max_length:
            issues.append(f"Output too long (max {self.max_length} chars)")
        if not output.strip():
            issues.append("Empty output")

        return not issues, issues

    def score_example(self, example: Dict[str, Any]) -> float:
        """
        Score example quality (0-100).

        Scoring dimensions:
        - Validity: 20 points off per validation issue
        - Length appropriateness: 20 off if too short, 10 off if too long
        - Repetition: 30 off when under 30% of the words are unique
        - Sentences: 20 off when no sentence fragment exceeds 10 characters
        - Gibberish: 25 off when the average word length is implausible

        Args:
            example: Data example

        Returns:
            Quality score (0-100)
        """
        score = 100.0

        # Validity penalty; bail out early once nothing is left to lose.
        is_valid, issues = self.validate_example(example)
        if not is_valid:
            score -= 20.0 * len(issues)
            if score <= 0:
                return 0.0

        # A missing or non-text output is scored as empty text so the
        # heuristics below never operate on a non-string value.
        output = example.get("output", "")
        if not isinstance(output, str):
            output = ""
        output_len = len(output)

        # Length scoring
        if output_len < self.min_length:
            score -= 20.0
        elif output_len > self.max_length:
            score -= 10.0

        # Repetition: ratio of distinct words to total words.
        words = output.lower().split()
        if words:
            unique_ratio = len(set(words)) / len(words)
            if unique_ratio < 0.3:  # Too repetitive
                score -= 30.0

        # Proper sentences (basic): need at least one fragment over 10 chars.
        fragments = re.split(r'[.!?]+', output)
        if not any(len(fragment.strip()) > 10 for fragment in fragments):
            score -= 20.0

        # Gibberish (basic): only judged on outputs longer than 20 characters.
        if output_len > 20:
            avg_word_len = sum(len(word) for word in words) / max(len(words), 1)
            if avg_word_len > 15 or avg_word_len < 2:  # Likely gibberish
                score -= 25.0

        return max(0.0, min(100.0, score))

    def validate_batch(
        self,
        data: List[Dict[str, Any]],
        verbose: bool = False
    ) -> Dict[str, Any]:
        """
        Validate a batch of examples.

        An example is kept when it passes validate_example and its score is at
        least ``min_score``. Every rejected example is recorded together with
        its issues and score; a structurally valid example rejected only for
        its score gets an explicit below-threshold issue so the rejection
        reason is never empty.

        Args:
            data: List of data examples
            verbose: Print detailed validation info

        Returns:
            Validation results dict with keys ``total``, ``valid``,
            ``invalid``, ``pass_rate`` (0-1), ``avg_score``, ``valid_data``
            and ``invalid_data``
        """
        valid_data = []
        invalid_data = []
        scores = []

        for i, example in enumerate(data):
            is_valid, issues = self.validate_example(example)
            score = self.score_example(example)
            scores.append(score)

            if is_valid and score >= self.min_score:
                valid_data.append(example)
                continue

            if is_valid:
                # Rejected purely on score: record why.
                issues = [f"Quality score below threshold (min {self.min_score})"]

            invalid_data.append({
                "example": example,
                "issues": issues,
                "score": score
            })
            if verbose:
                print(f"Example {i} failed validation (score: {score:.1f})")
                for issue in issues:
                    print(f" - {issue}")

        avg_score = sum(scores) / len(scores) if scores else 0.0
        results = {
            "total": len(data),
            "valid": len(valid_data),
            "invalid": len(invalid_data),
            "pass_rate": len(valid_data) / len(data) if data else 0.0,
            "avg_score": avg_score,
            "valid_data": valid_data,
            "invalid_data": invalid_data
        }

        if verbose:
            print("\n✅ Validation complete:")
            print(f" Total: {results['total']}")
            print(f" Valid: {results['valid']}")
            print(f" Invalid: {results['invalid']}")
            print(f" Pass rate: {results['pass_rate']*100:.1f}%")
            print(f" Avg score: {avg_score:.1f}")

        return results

    def filter_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Filter data, keeping only valid examples.

        Args:
            data: List of data examples

        Returns:
            Filtered valid data
        """
        return self.validate_batch(data)["valid_data"]

    def get_quality_report(self, data: List[Dict[str, Any]]) -> str:
        """
        Generate a quality report for the data.

        The report lists batch totals, pass rate and average score, followed
        by the issues found on rejected examples ordered from most to least
        frequent.

        Args:
            data: List of data examples

        Returns:
            Formatted quality report
        """
        results = self.validate_batch(data)

        report = (
            "\n"
            "DATA QUALITY REPORT\n"
            "==================\n"
            f"Total Examples: {results['total']}\n"
            f"Valid Examples: {results['valid']}\n"
            f"Invalid Examples: {results['invalid']}\n"
            f"Pass Rate: {results['pass_rate']*100:.1f}%\n"
            f"Average Quality Score: {results['avg_score']:.1f}/100\n"
        )

        # Tally how many rejected examples exhibit each issue.
        issue_counts: Dict[str, int] = {}
        for item in results['invalid_data']:
            for issue in item['issues']:
                issue_counts[issue] = issue_counts.get(issue, 0) + 1

        if issue_counts:
            report += "COMMON ISSUES:\n"
            # Stable sort: ties keep first-seen order.
            ranked = sorted(issue_counts.items(), key=lambda x: x[1], reverse=True)
            report += "".join(
                f" - {issue}: {count} examples\n" for issue, count in ranked
            )

        return report