# 🚀 HRHUB - Bilateral Matching System

## 🎯 Mathematical Framework:

```
Candidate ∈ ℝⁿ (multidimensional vector)
Company ∈ ℝⁿ (multidimensional vector)

Both live in the SAME vector space!

Match Score = cosine_similarity(v_candidate, v_company)
```

## 📊 Dataset:
- **9,544 candidates** (35 dimensions)
- **180,000 companies** (multiple dimensions from merged data)

---

## 📦 Step 1: Install & Import

In [None]:
!pip install -q sentence-transformers plotly anthropic

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

print("βœ… Ready!")

## 📂 Step 2: Load & Merge Company Data

Building rich 180K company entities by merging multiple tables.

In [None]:
def _read_and_report(label, csv_path):
    """Read one CSV file into a DataFrame and print its row count under `label`."""
    table = pd.read_csv(csv_path)
    print(f"βœ… {label}: {len(table):,} rows")
    return table


print("πŸ“‚ Loading company datasets...\n")

# Core company table.
companies_base = _read_and_report("Base companies", 'companies/companies.csv')

# Per-company side tables, all keyed by company_id further down the notebook.
company_industries = _read_and_report(
    "Company industries", 'companies/company_industries.csv'
)
company_specialties = _read_and_report(
    "Company specialties", 'companies/company_specialties.csv'
)
employee_counts = _read_and_report(
    "Employee counts", 'companies/employee_counts.csv'
)

# Lookup tables, kept around for reference only.
industries_map = pd.read_csv('mappings/industries.csv')
skills_map = pd.read_csv('mappings/skills.csv')
print("βœ… Mappings loaded")

print(f"\nπŸ“Š Base company columns: {companies_base.columns.tolist()}")

## 🔗 Step 3: Merge Company Data (Create Rich Entities)

Aggregate multiple dimensions into single company profile.

In [None]:
print("πŸ”— Merging company data...\n")

# Collapse the many-to-many industry links into one comma-separated string per
# company. Values are stringified first because str.join only accepts strings.
# NOTE(review): industries_list holds raw industry_id values; translating them
# through industries_map would give readable names — confirm that table's
# column names before wiring it in.
company_industries_agg = (
    company_industries
    .assign(industry_id=company_industries['industry_id'].astype(str))
    .groupby('company_id')['industry_id']
    .agg(', '.join)
    .reset_index(name='industries_list')
)

print(f"βœ… Aggregated industries for {len(company_industries_agg):,} companies")

# Same idea for specialties. Missing specialties are dropped first: a NaN in a
# group would otherwise make ' | '.join raise TypeError.
company_specialties_agg = (
    company_specialties
    .dropna(subset=['specialty'])
    .assign(specialty=lambda frame: frame['specialty'].astype(str))
    .groupby('company_id')['specialty']
    .agg(' | '.join)
    .reset_index(name='specialties_list')
)

print(f"βœ… Aggregated specialties for {len(company_specialties_agg):,} companies")

# Keep a single employee-count row per company so the left merge below cannot
# multiply company rows. When a company_id is repeated the last row wins.
# NOTE(review): assumes later rows are the more recent snapshots — confirm.
employee_counts_unique = employee_counts.drop_duplicates(
    subset='company_id', keep='last'
)

# Left-merge every side table onto the base table (which is left unmodified),
# then blank out the gaps so downstream truthiness checks see '' for "missing".
companies_full = (
    companies_base
    .merge(company_industries_agg, on='company_id', how='left')
    .merge(company_specialties_agg, on='company_id', how='left')
    .merge(employee_counts_unique, on='company_id', how='left')
    .fillna('')
)

print(f"\nβœ… MERGED DATASET CREATED!")
print(f"πŸ“Š Final companies: {len(companies_full):,} rows Γ— {len(companies_full.columns)} columns")
print(f"\nπŸ“‹ Columns: {companies_full.columns.tolist()}")

# Show sample
print(f"\nπŸ‘€ Sample company:")
companies_full.head(3)

## 📂 Step 4: Load Candidates

In [None]:
# Read the candidate table and replace every missing value with an empty
# string in a single expression.
candidates = pd.read_csv('resume_data.csv').fillna('')

print(f"βœ… Loaded {candidates.shape[0]:,} candidates Γ— {candidates.shape[1]} columns")
print(f"\nπŸ“‹ Candidate columns: {candidates.columns[:10].tolist()}...")
candidates.head(3)

## πŸ“ Step 5: Create Text Representations (ℝⁿ preparation)

Transform structured data → unified text → embeddings → vectors ∈ ℝⁿ

In [None]:
print("πŸ“ Creating text representations...\n")

# Candidate text
def make_candidate_text(row):
    """Build the single text string that represents one candidate.

    Every non-empty field of `row` becomes a "Label: value" segment; segments
    are joined with " | ". Returns "No info" when no field has a truthy value.
    `row` only needs dict-style ``get`` and ``[]`` access.
    """
    # (field name, label in the output, character limit or None for no limit)
    field_specs = (
        ('skills', 'Skills', None),
        ('career_objective', 'Objective', None),
        ('educational_institution_name', 'Education', None),
        ('degree_names', 'Degree', None),
        ('major_field_of_studies', 'Field', None),
        ('positions', 'Experience', None),
        ('responsibilities', 'Responsibilities', 200),
    )

    segments = []
    for field, label, limit in field_specs:
        if not row.get(field):
            continue
        value = row[field]
        if limit is not None:
            # Only the free-text field is truncated to keep the text compact.
            value = str(value)[:limit]
        segments.append(f"{label}: {value}")

    if not segments:
        return "No info"
    return ' | '.join(segments)

# Company text (from merged data!)
def make_company_text(row):
    """Build the single text string that represents one company.

    Every non-empty field of `row` becomes a "Label: value" segment; segments
    are joined with " | ". The description is cut to 300 characters. City,
    state and country are combined into one "Location" segment that lists
    only the parts that are present, so a partially filled location no longer
    produces stray separators such as "Location: , , US".

    Args:
        row: Any object with dict-style ``get`` and ``[]`` access.

    Returns:
        The joined text, or "No info" when no field has a truthy value.
    """
    parts = []

    if row.get('name'):
        parts.append(f"Company: {row['name']}")
    if row.get('description'):
        parts.append(f"Description: {str(row['description'])[:300]}")
    if row.get('industries_list'):
        parts.append(f"Industries: {row['industries_list']}")
    if row.get('specialties_list'):
        parts.append(f"Specialties: {row['specialties_list']}")
    if row.get('employee_count'):
        parts.append(f"Size: {row['employee_count']} employees")
    if row.get('follower_count'):
        parts.append(f"Followers: {row['follower_count']}")

    # Keep only the location pieces that actually carry a value.
    location = [
        str(row.get(key))
        for key in ('city', 'state', 'country')
        if row.get(key)
    ]
    if location:
        parts.append(f"Location: {', '.join(location)}")

    return ' | '.join(parts) if parts else "No info"

# Build the text column row by row for both sides of the match.
candidates['text'] = candidates.apply(make_candidate_text, axis='columns')
companies_full['text'] = companies_full.apply(make_company_text, axis='columns')

print("βœ… Text created!")

# Preview the first 300 characters of the first row on each side.
for side, frame in (("candidate", candidates), ("company", companies_full)):
    print(f"\nπŸ“„ Sample {side} text:\n{frame['text'].iloc[0][:300]}...")

## 🧠 Step 6: Generate Embeddings (Transform to ℝⁿ)

**CRITICAL:** This creates vectors in the SAME mathematical space!

In [None]:
print("🧠 Loading embedding model...")
# The same model encodes both sides, so candidate and company vectors are
# directly comparable.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Creates 384-dim vectors

print(f"βœ… Model loaded! Embedding dimension: {model.get_sentence_embedding_dimension()}")

print("\nπŸ”„ Generating candidate vectors (this may take a few minutes)...")
cand_vectors = model.encode(
    candidates['text'].tolist(),
    show_progress_bar=True,
)

print("\nπŸ”„ Generating company vectors (180K companies - this will take time!)...")
comp_vectors = model.encode(
    companies_full['text'].tolist(),
    show_progress_bar=True,
    batch_size=64,
)

print("\nβœ… VECTORS CREATED!")
for side, vectors in (("Candidate", cand_vectors), ("Company", comp_vectors)):
    print(f"πŸ“Š {side} vectors: {vectors.shape}")
print(f"\n🎯 Both live in ℝ^{model.get_sentence_embedding_dimension()} !")

## 🎯 Step 7: Matching Engine (Cosine Similarity)

In [None]:
def cosine_similarity(a, b):
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector, same length as `a`.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        norm.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero vector has no direction. Report "no similarity" instead of
        # dividing by zero and letting NaN leak into the ranking.
        return 0.0
    return np.dot(a, b) / denominator

def find_top_matches(candidate_idx, top_k=10, candidate_vectors=None, company_vectors=None):
    """Find the top K company matches for one candidate by cosine similarity.

    All companies are scored in one vectorised NumPy pass instead of a Python
    loop over every company vector.

    Args:
        candidate_idx: Row position of the candidate in the candidate vectors.
        top_k: Maximum number of matches to return.
        candidate_vectors: Optional 2-D array of candidate embeddings; defaults
            to the notebook-level ``cand_vectors``.
        company_vectors: Optional 2-D array of company embeddings; defaults to
            the notebook-level ``comp_vectors``.

    Returns:
        List of ``(company_idx, similarity_score)`` tuples sorted by score,
        highest first. Equal scores keep ascending company index order.
        Pairs involving a zero-norm vector score 0.0.
    """
    if candidate_vectors is None:
        candidate_vectors = cand_vectors
    if company_vectors is None:
        company_vectors = comp_vectors

    company_matrix = np.asarray(company_vectors)
    if company_matrix.shape[0] == 0:
        return []
    candidate_vec = np.asarray(candidate_vectors[candidate_idx])

    # One matrix-vector product gives every dot product at once.
    dots = company_matrix @ candidate_vec
    denominators = np.linalg.norm(company_matrix, axis=1) * np.linalg.norm(candidate_vec)

    # Leave the score at 0.0 wherever a norm is zero, so no NaN enters the ranking.
    scores = np.zeros(dots.shape[0], dtype=float)
    valid = denominators > 0
    scores[valid] = dots[valid] / denominators[valid]

    # A stable sort on the negated scores ranks descending while keeping
    # index order among ties.
    order = np.argsort(-scores, kind="stable")[:top_k]
    return [(int(idx), float(scores[idx])) for idx in order]

# Confirm the matching functions are defined and report how many candidates
# and companies are currently loaded.
print("βœ… Matching engine ready!")
print(f"πŸ“Š Ready to match {len(candidates):,} candidates with {len(companies_full):,} companies!")

## πŸ” Step 8: Test - Find Matches for Candidate #0

In [None]:
print("πŸ” Finding top 10 matches for Candidate #0...\n")

matches = find_top_matches(0, top_k=10)

print("🎯 Top 10 Company Matches:\n")
print("=" * 80)
print(f"{'Rank':<6} {'Score':<8} {'Company Name':<40} {'Industry'}")
print("=" * 80)

for rank, (comp_idx, score) in enumerate(matches, 1):
 company_name = companies_full.iloc[comp_idx].get('name', 'N/A')[:40]
 industry = companies_full.iloc[comp_idx].get('industries_list', 'N/A')[:30]
 print(f"{rank:<6} {score:.4f} {company_name:<40} {industry}")

print("=" * 80)

## 📊 Step 9: Visualize Match Distribution

In [None]:
# Collect the top-10 match scores for a sample of candidates (at most 100).
all_scores = []
sample_size = min(100, len(candidates))

print(f"πŸ“Š Computing match scores for {sample_size} candidates...")

for i in range(sample_size):
    if i % 20 == 0:
        print(f" Progress: {i}/{sample_size}")
    matches = find_top_matches(i, top_k=10)
    all_scores.extend(score for _, score in matches)

# Plot. The title describes what is actually drawn: only the top-10 scores of
# the sampled candidates, not every candidate-company pair.
fig = px.histogram(
    x=all_scores,
    nbins=50,
    title=(
        f'Distribution of Top-10 Match Scores '
        f'({sample_size:,} sampled candidates, {len(companies_full):,} companies)'
    ),
    labels={'x': 'Cosine Similarity Score'}
)
fig.show()

# Summary statistics over the same sampled top-10 scores.
print(f"\nπŸ“Š Statistics:")
print(f" Mean: {np.mean(all_scores):.4f}")
print(f" Median: {np.median(all_scores):.4f}")
print(f" Std: {np.std(all_scores):.4f}")
print(f" Max: {np.max(all_scores):.4f}")

## 💾 Step 10: Export Results

In [None]:
# Collect one record per (candidate, matched company) pair for a sample of
# candidates, then write them to CSV.
results = []
export_sample = min(500, len(candidates))  # Export matches for 500 candidates

print(f"πŸ’Ύ Generating matches for {export_sample} candidates...\n")

for i in range(export_sample):
    if not i % 50:
        print(f" Progress: {i}/{export_sample}")

    matches = find_top_matches(i, top_k=10)

    for rank, (comp_idx, score) in enumerate(matches, start=1):
        company_row = companies_full.iloc[comp_idx]
        record = {
            'candidate_id': i,
            'company_id': company_row.get('company_id'),
            'company_name': company_row.get('name', 'N/A'),
            'rank': rank,
            'similarity_score': float(score),
            'industry': company_row.get('industries_list', 'N/A')[:50],
        }
        results.append(record)

# Create DataFrame
results_df = pd.DataFrame(results)
results_df.to_csv('hrhub_matches.csv', index=False)

print(f"\nβœ… Exported {len(results_df):,} matches to hrhub_matches.csv")
print(f"\nπŸ‘€ Preview:")
results_df.head(20)

## 🎉 DONE!

### ✅ What you have:
- **9,544 candidates** as vectors ∈ ℝ³⁸⁴
- **180,000 companies** as vectors ∈ ℝ³⁸⁴
- Both in the SAME mathematical space!
- Cosine similarity matching
- Exported results

### 🚀 Next steps:
1. Add LLM explanations (optional - needs API key)
2. Implement user weights for dimensions
3. Build UI/API on top