See the scenario below.
It is quite typical in data warehouses, to access data via dimension tables.
This is an example for a date dimension.
-- row generator
create table rgen
select 1 as n union select 2 union select 3 union select 4 union
select 5 union select 6 union select 7 union select 8 union
select 9 union select 10;
-- fact table: 1 million rows
select @a:=0;
create table r1tst
select
@a:=@a + 1 as rkey,
floor(@a/10000) as rbatchid,
floor(rand()*10) + 1 as rtype,
adddate('2010-01-01', interval (@a/4000) day) as rdate,
(rand()*1000000/100) as rsum
from rgen a,rgen b,rgen c,rgen d,rgen e,rgen f;
alter table r1tst add index(rbatchid),
add index(rdate);
-- date dimension
select @d:=-1;
create table r_date_dim select
@d:=@d+1 as date_key,
adddate('2010-01-01', interval @d day) as date_dt,
DAY(adddate('2010-01-01', interval @d day)) as date_day,
DAYNAME(adddate('2010-01-01', interval @d day)) as date_day_name,
DAYOFMONTH(adddate('2010-01-01', interval @d day)) as date_day_of_month,
DAYOFWEEK(adddate('2010-01-01', interval @d day)) as date_day_of_week,
DAYOFYEAR(adddate('2010-01-01', interval @d day)) as date_day_of_year,
LAST_DAY(adddate('2010-01-01', interval @d day)) as date_last_day,
MONTH(adddate('2010-01-01', interval @d day)) as date_month,
MONTHNAME(adddate('2010-01-01', interval @d day)) as date_month_name,
QUARTER(adddate('2010-01-01', interval @d day)) as date_quarter,
TO_DAYS(adddate('2010-01-01', interval @d day)) as date_days,
WEEK(adddate('2010-01-01', interval @d day)) as date_week,
WEEKDAY(adddate('2010-01-01', interval @d day)) as date_week_day,
WEEKOFYEAR(adddate('2010-01-01', interval @d day)) as date_week_of_year,
YEAR(adddate('2010-01-01', interval @d day)) as date_year,
YEARWEEK(adddate('2010-01-01', interval @d day)) as year_week
from rgen a,rgen b, rgen c;
alter table r_date_dim add index (date_year),
add index (date_month), add index(date_dt),
add index (date_key);
-- query A:
mysql> select sum(rsum) from r1tst v1,r_date_dim v2
-> where v1.rdate=v2.date_dt and v2.date_year=2010 and v2.date_quarter=2;
+-----------------+
| sum(rsum) |
+-----------------+
| 1819518419.8031 |
+-----------------+
1 row in set (37.64 sec)
mysql> explain select sum(rsum) from r1tst v1,r_date_dim v2
-> where v1.rdate=v2.date_dt and v2.date_year=2010 and v2.date_quarter=2;
+--+------+---+----+-----------------+-------+----+--------+-------+-----------+
|id|select|tab|type|possible_keys |key | len| ref |rows |Extra |
+--+------+---+----+-----------------+-------+----+--------+-------+-----------+
| 1|SIMPLE|v1 |ALL |rdate |NULL |NULL|NULL |1000000| |
| 1|SIMPLE|v2 |ref |date_year,date_dt|date_dt|9 |v1.rdate| 1|Using where|
+--+------+---+----+-----------------+-------+----+--------+-------+-----------+
2 rows in set (0.03 sec)
-- query B:
mysql> select sum(rsum) from r1tst v1,r_date_dim v2
-> where v1.rdate=date(v2.date_dt) and v2.date_year=2010 and v2.date_quarter=2;
+-----------------+
| sum(rsum) |
+-----------------+
| 1819518419.8031 |
+-----------------+
1 row in set (5.41 sec)
mysql> explain select sum(rsum) from r1tst v1,r_date_dim v2
-> where v1.rdate=date(v2.date_dt) and v2.date_year=2010 and v2.date_quarter=2;
+--+------+---+----+---------+---------+----+-----+----+-----------+
|id|select|tab|type|possible |key | len| ref |rows|Extra |
+--+------+---+----+---------+---------+----+-----+----+-----------+
| 1|SIMPLE| v2|ref |date_year|date_year| 5 |const| 361|Using where|
| 1|SIMPLE| v1|ref |rdate |rdate | 9 | func|3984|Using where|
+--+------+---+----+---------+---------+----+-----+----+-----------+
2 rows in set (0.00 sec)
In theory it should be a really bad idea having a function around the the join criteria, as it prevents index usage.
Why is query B faster than query A?
(Find all correct answers)
a) There is no good index on the date-dimension, so the optimizer thinks A is the best plan.
b) In B, the date() in the join forces a certain join order.
c) Restricting on the fact table directly is more efficient than going via date dimension.
[ Basic Optimizations (10%) - Overview of Optimization Principles ]